4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include <lustre/ll_fiemap.h>
49 #include <lustre_ioctl.h>
51 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/*
 * Forward declarations for helpers used before their definitions in this
 * file (group-lock release, lease close, ioctl dispatch).
 * NOTE(review): extract is missing source lines (internal numbering jumps);
 * the return type of ll_put_grouplock is not visible here.
 */
57 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
59 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
62 static enum llioc_iter
63 ll_iocontrol_call(struct inode *inode, struct file *file,
64 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from ll_file_data_slab (GFP_NOFS to
 * avoid filesystem re-entry during reclaim).
 * NOTE(review): extract is missing lines (66->68->70->74); the allocation
 * failure check and return statement are not visible here.
 */
66 static struct ll_file_data *ll_file_data_get(void)
68 struct ll_file_data *fd;
70 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start with a clean write-error state; consulted at close/fsync time */
74 fd->fd_write_failed = false;
/* Return an ll_file_data to its slab cache (counterpart of
 * ll_file_data_get()). */
79 static void ll_file_data_put(struct ll_file_data *fd)
82 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * the given open handle and the MDS capability into @op_data, and set
 * MDS_DATA_MODIFIED when the inode carries the LLIF_DATA_MODIFIED flag.
 * NOTE(review): extract is missing source lines (internal numbering jumps);
 * the body braces are not visible here.
 */
85 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
86 struct lustre_handle *fh)
88 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
89 op_data->op_attr.ia_mode = inode->i_mode;
90 op_data->op_attr.ia_atime = inode->i_atime;
91 op_data->op_attr.ia_mtime = inode->i_mtime;
92 op_data->op_attr.ia_ctime = inode->i_ctime;
93 op_data->op_attr.ia_size = i_size_read(inode);
94 op_data->op_attr_blocks = inode->i_blocks;
95 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
/* tell the MDS the data was modified so it can update timestamps */
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Packs all the attributes into @op_data for the CLOSE rpc.
107 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
108 struct obd_client_handle *och)
/* always send mode and timestamps; size/blocks handling depends on the
 * open mode (the branch body between lines 116 and 119 is not visible in
 * this extract — presumably non-writers skip or clear size attributes;
 * TODO confirm against the full source) */
112 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
113 ATTR_MTIME | ATTR_MTIME_SET |
114 ATTR_CTIME | ATTR_CTIME_SET;
116 if (!(och->och_flags & FMODE_WRITE))
119 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
122 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
123 ll_prep_md_op_data(op_data, inode, NULL, NULL,
124 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE RPC for @och. When @data_version is non-NULL the close
 * is an HSM release (MDS_HSM_RELEASE bias, data version and lease handle
 * packed into the request). On success, clears LLIF_DATA_MODIFIED if it
 * was sent, clears open-replay data, and poisons the file handle cookie.
 * NOTE(review): extract is missing source lines (internal numbering jumps);
 * the 'inode' parameter line, several braces and return paths are not
 * visible here.
 */
128 static int ll_close_inode_openhandle(struct obd_export *md_exp,
130 struct obd_client_handle *och,
131 const __u64 *data_version)
133 struct obd_export *exp = ll_i2mdexp(inode);
134 struct md_op_data *op_data;
135 struct ptlrpc_request *req = NULL;
136 struct obd_device *obd = class_exp2obd(exp);
142 * XXX: in case of LMV, is this correct to access
145 CERROR("Invalid MDC connection handle "LPX64"\n",
146 ll_i2mdexp(inode)->exp_handle.h_cookie);
150 OBD_ALLOC_PTR(op_data);
152 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
154 ll_prepare_close(inode, op_data, och);
155 if (data_version != NULL) {
156 /* Pass in data_version implies release. */
157 op_data->op_bias |= MDS_HSM_RELEASE;
158 op_data->op_data_version = *data_version;
159 op_data->op_lease_handle = och->och_lease_handle;
160 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 rc = md_close(md_exp, op_data, och->och_mod, &req);
165 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
166 ll_i2mdexp(inode)->exp_obd->obd_name,
167 PFID(ll_inode2fid(inode)), rc);
170 /* DATA_MODIFIED flag was successfully sent on close, cancel data
171 * modification flag. */
172 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
173 struct ll_inode_info *lli = ll_i2info(inode);
175 spin_lock(&lli->lli_lock);
176 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
177 spin_unlock(&lli->lli_lock);
/* for HSM release, check whether the MDT actually released the file */
180 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
181 struct mdt_body *body;
182 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
183 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
187 ll_finish_md_op_data(op_data);
191 md_clear_open_replay_data(md_exp, och);
192 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
195 if (req) /* This is close request */
196 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of @inode that matches @fmode
 * (write/exec/read), but only when no other local users of that handle
 * remain (checked under lli_och_mutex).
 * NOTE(review): extract is missing source lines (internal numbering
 * jumps); the och detach, early-return path and the trailing arguments of
 * the ll_close_inode_openhandle() call are not visible here.
 */
200 int ll_md_real_close(struct inode *inode, fmode_t fmode)
202 struct ll_inode_info *lli = ll_i2info(inode);
203 struct obd_client_handle **och_p;
204 struct obd_client_handle *och;
/* pick the per-mode open handle slot and its use count */
209 if (fmode & FMODE_WRITE) {
210 och_p = &lli->lli_mds_write_och;
211 och_usecount = &lli->lli_open_fd_write_count;
212 } else if (fmode & FMODE_EXEC) {
213 och_p = &lli->lli_mds_exec_och;
214 och_usecount = &lli->lli_open_fd_exec_count;
216 LASSERT(fmode & FMODE_READ);
217 och_p = &lli->lli_mds_read_och;
218 och_usecount = &lli->lli_open_fd_read_count;
221 mutex_lock(&lli->lli_och_mutex);
222 if (*och_usecount > 0) {
223 /* There are still users of this handle, so skip
225 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
234 /* There might be a race and this handle may already
236 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop group lock and lease if held, close any
 * private open handle, decrement the per-mode open counters, and call
 * ll_md_real_close() unless a cached OPEN lock lets us skip the MDS RPC.
 * Finally detach and free the ll_file_data.
 * NOTE(review): extract is missing source lines (internal numbering
 * jumps); the 'file' parameter line, 'lockmode' setup and several braces
 * are not visible here.
 */
243 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
246 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
247 struct ll_inode_info *lli = ll_i2info(inode);
251 /* clear group lock, if present */
252 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
253 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
255 if (fd->fd_lease_och != NULL) {
258 /* Usually the lease is not released when the
259 * application crashed, we need to release here. */
260 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
261 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
262 PFID(&lli->lli_fid), rc, lease_broken);
264 fd->fd_lease_och = NULL;
267 if (fd->fd_och != NULL) {
268 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
273 /* Let's see if we have good enough OPEN lock on the file and if
274 we can skip talking to MDS */
275 if (file->f_dentry->d_inode) { /* Can this ever be false? */
277 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
278 struct lustre_handle lockh;
279 struct inode *inode = file->f_dentry->d_inode;
280 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
282 mutex_lock(&lli->lli_och_mutex);
283 if (fd->fd_omode & FMODE_WRITE) {
285 LASSERT(lli->lli_open_fd_write_count);
286 lli->lli_open_fd_write_count--;
287 } else if (fd->fd_omode & FMODE_EXEC) {
289 LASSERT(lli->lli_open_fd_exec_count);
290 lli->lli_open_fd_exec_count--;
293 LASSERT(lli->lli_open_fd_read_count);
294 lli->lli_open_fd_read_count--;
296 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN lock: the MDS must be told about this close */
298 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
299 LDLM_IBITS, &policy, lockmode,
301 rc = ll_md_real_close(file->f_dentry->d_inode,
305 CERROR("released file has negative dentry: file = %p, "
306 "dentry = %p, name = %s\n",
307 file, file->f_dentry, file->f_dentry->d_name.name);
311 LUSTRE_FPRIVATE(file) = NULL;
312 ll_file_data_put(fd);
313 ll_capa_close(inode);
318 /* While this returns an error code, fput() the caller does not, so we need
319 * to make every effort to clean up all of our state here. Also, applications
320 * rarely check close errors and even if an error is returned they will not
321 * re-try the close call.
323 int ll_file_release(struct inode *inode, struct file *file)
325 struct ll_file_data *fd;
326 struct ll_sb_info *sbi = ll_i2sbi(inode);
327 struct ll_inode_info *lli = ll_i2info(inode);
331 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
332 PFID(ll_inode2fid(inode)), inode);
/* remote-client ACL bookkeeping only applies to the root inode */
334 #ifdef CONFIG_FS_POSIX_ACL
335 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
336 inode == inode->i_sb->s_root->d_inode) {
337 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
340 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
341 fd->fd_flags &= ~LL_FILE_RMTACL;
342 rct_del(&sbi->ll_rct, current_pid());
343 et_search_free(&sbi->ll_et, current_pid());
348 if (inode->i_sb->s_root != file->f_dentry)
349 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
350 fd = LUSTRE_FPRIVATE(file);
353 /* The last ref on @file, maybe not the owner pid of statahead,
354 * because parent and child process can share the same file handle. */
355 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
356 ll_deauthorize_statahead(inode, fd);
/* root dentry: nothing was opened on the MDS, just free the fd */
358 if (inode->i_sb->s_root == file->f_dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
364 if (!S_ISDIR(inode->i_mode)) {
365 if (lli->lli_clob != NULL)
366 lov_read_and_clear_async_rc(lli->lli_clob);
367 lli->lli_async_rc = 0;
370 rc = ll_md_close(sbi->ll_md_exp, inode, file);
372 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
373 libcfs_debug_dumplog();
/*
 * Send an intent-open (md_intent_lock) for @file, optionally packing the
 * dentry name when the server lacks OBD_CONNECT_OPEN_BY_FID, then update
 * the inode from the reply and attach the returned lock data.
 * NOTE(review): extract is missing source lines (internal numbering
 * jumps); 'len'/'rc' declarations, IS_ERR checks and several braces are
 * not visible here.
 */
378 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
379 struct lookup_intent *itp)
381 struct dentry *de = file->f_dentry;
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keep own exit path - don't flood log
414 * with messages with -ESTALE errors.
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->d.lustre.it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
/*
 * Populate @och from the MDT reply carried by @it (file handle, fid,
 * lease lock handle, open flags) and register it for open replay.
 * NOTE(review): extract is missing source lines (braces, reply-body
 * validation) per the internal numbering jumps.
 */
443 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
444 struct obd_client_handle *och)
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 och->och_fh = body->mbo_handle;
451 och->och_fid = body->mbo_fid1;
452 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_flags = it->it_flags;
456 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the local part of an open: optionally fill @och from the intent,
 * attach @fd as the file's private data, and initialize readahead state,
 * the recorded open mode and the ll_cl_context bookkeeping.
 * NOTE(review): extract is missing source lines (the 'if (och)' guard
 * around ll_och_fill() and the error check on rc are not visible here).
 */
459 static int ll_local_open(struct file *file, struct lookup_intent *it,
460 struct ll_file_data *fd, struct obd_client_handle *och)
462 struct inode *inode = file->f_dentry->d_inode;
465 LASSERT(!LUSTRE_FPRIVATE(file));
472 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
477 LUSTRE_FPRIVATE(file) = fd;
478 ll_readahead_init(inode, &fd->fd_ras);
479 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
481 /* ll_cl_context initialize */
482 rwlock_init(&fd->fd_lock);
483 INIT_LIST_HEAD(&fd->fd_lccs);
488 /* Open a file, and (for the very first open) create objects on the OSTs at
489 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
490 * creation or open until ll_lov_setstripe() ioctl is called.
492 * If we already have the stripe MD locally then we don't request it in
493 * md_open(), by passing a lmm_size = 0.
495 * It is up to the application to ensure no other processes open this file
496 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
497 * used. We might be able to avoid races of that sort by getting lli_open_sem
498 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
499 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
501 int ll_file_open(struct inode *inode, struct file *file)
/* NOTE(review): this extract is missing many source lines (internal
 * numbering jumps); several braces, GOTO labels and error checks are not
 * visible — do not treat this fragment as compilable. */
503 struct ll_inode_info *lli = ll_i2info(inode);
504 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
505 .it_flags = file->f_flags };
506 struct obd_client_handle **och_p = NULL;
507 __u64 *och_usecount = NULL;
508 struct ll_file_data *fd;
512 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
513 PFID(ll_inode2fid(inode)), inode, file->f_flags);
515 it = file->private_data; /* XXX: compat macro */
516 file->private_data = NULL; /* prevent ll_local_open assertion */
518 fd = ll_file_data_get();
520 GOTO(out_openerr, rc = -ENOMEM);
523 if (S_ISDIR(inode->i_mode))
524 ll_authorize_statahead(inode, fd);
/* root of the mount: no MDS open needed, just attach the fd */
526 if (inode->i_sb->s_root == file->f_dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: build our own IT_OPEN intent */
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_dentry, it);
588 rc = ll_local_open(file, it, fd, NULL);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 mutex_unlock(&lli->lli_och_mutex);
604 * Normally called under two situations:
606 * 2. A race/condition on MDS resulting in no open
607 * handle to be returned from LOOKUP|OPEN request,
608 * for example if the target entry was a symlink.
610 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
612 * Always specify MDS_OPEN_BY_FID because we don't want
613 * to get file with different fid.
615 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
616 rc = ll_intent_file_open(file, NULL, 0, it);
618 GOTO(out_openerr, rc);
622 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
624 GOTO(out_och_free, rc = -ENOMEM);
628 /* md_intent_lock() didn't get a request ref if there was an
629 * open error, so don't do cleanup on the request here
631 /* XXX (green): Should not we bail out on any error here, not
632 * just open error? */
633 rc = it_open_error(DISP_OPEN_OPEN, it);
635 GOTO(out_och_free, rc);
637 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
638 "inode %p: disposition %x, status %d\n", inode,
639 it_disposition(it, ~0), it->d.lustre.it_status);
641 rc = ll_local_open(file, it, fd, *och_p);
643 GOTO(out_och_free, rc);
645 mutex_unlock(&lli->lli_och_mutex);
648 /* Must do this outside lli_och_mutex lock to prevent deadlock where
649 different kind of OPEN lock for this same inode gets cancelled
650 by ldlm_cancel_lru */
651 if (!S_ISREG(inode->i_mode))
652 GOTO(out_och_free, rc);
656 cl_lov_delay_create_clear(&file->f_flags);
657 GOTO(out_och_free, rc);
/* error cleanup: free the MDS handle slot, drop statahead, free the fd */
661 if (och_p && *och_p) {
662 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
663 *och_p = NULL; /* OBD_FREE writes some magic there */
666 mutex_unlock(&lli->lli_och_mutex);
669 if (lli->lli_opendir_key == fd)
670 ll_deauthorize_statahead(inode, fd);
672 ll_file_data_put(fd);
674 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
677 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
678 ptlrpc_req_finished(it->d.lustre.it_data);
679 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lock
 * asynchronously; the LDLM_CB_CANCELING branch body is not visible in
 * this extract (internal numbering jumps) — presumably it only marks the
 * lease broken; TODO confirm against the full source.
 */
685 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
686 struct ldlm_lock_desc *desc, void *data, int flag)
689 struct lustre_handle lockh;
693 case LDLM_CB_BLOCKING:
694 ldlm_lock2handle(lock, &lockh);
695 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
697 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
701 case LDLM_CB_CANCELING:
709 * Acquire a lease and open the file.
711 static struct obd_client_handle *
712 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
/* NOTE(review): this extract is missing many source lines (internal
 * numbering jumps); the 'open_flags' parameter line, och allocation,
 * several braces and return statements are not visible here. */
715 struct lookup_intent it = { .it_op = IT_OPEN };
716 struct ll_sb_info *sbi = ll_i2sbi(inode);
717 struct md_op_data *op_data;
718 struct ptlrpc_request *req = NULL;
719 struct lustre_handle old_handle = { 0 };
720 struct obd_client_handle *och = NULL;
/* a lease is exclusively read or write, never exec or combined */
725 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
726 RETURN(ERR_PTR(-EINVAL));
729 struct ll_inode_info *lli = ll_i2info(inode);
730 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
731 struct obd_client_handle **och_p;
734 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
735 RETURN(ERR_PTR(-EPERM));
737 /* Get the openhandle of the file */
739 mutex_lock(&lli->lli_och_mutex);
740 if (fd->fd_lease_och != NULL) {
741 mutex_unlock(&lli->lli_och_mutex);
745 if (fd->fd_och == NULL) {
746 if (file->f_mode & FMODE_WRITE) {
747 LASSERT(lli->lli_mds_write_och != NULL);
748 och_p = &lli->lli_mds_write_och;
749 och_usecount = &lli->lli_open_fd_write_count;
751 LASSERT(lli->lli_mds_read_och != NULL);
752 och_p = &lli->lli_mds_read_och;
753 och_usecount = &lli->lli_open_fd_read_count;
755 if (*och_usecount == 1) {
762 mutex_unlock(&lli->lli_och_mutex);
763 if (rc < 0) /* more than 1 opener */
766 LASSERT(fd->fd_och != NULL);
767 old_handle = fd->fd_och->och_fh;
772 RETURN(ERR_PTR(-ENOMEM));
774 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
775 LUSTRE_OPC_ANY, NULL);
777 GOTO(out, rc = PTR_ERR(op_data));
779 /* To tell the MDT this openhandle is from the same owner */
780 op_data->op_handle = old_handle;
782 it.it_flags = fmode | open_flags;
783 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
784 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
785 &ll_md_blocking_lease_ast,
786 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
787 * it can be cancelled which may mislead applications that the lease is
789 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
790 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
791 * doesn't deal with openhandle, so normal openhandle will be leaked. */
792 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
793 ll_finish_md_op_data(op_data);
794 ptlrpc_req_finished(req);
796 GOTO(out_release_it, rc);
798 if (it_disposition(&it, DISP_LOOKUP_NEG))
799 GOTO(out_release_it, rc = -ENOENT);
801 rc = it_open_error(DISP_OPEN_OPEN, &it);
803 GOTO(out_release_it, rc);
805 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
806 ll_och_fill(sbi->ll_md_exp, &it, och);
808 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
809 GOTO(out_close, rc = -EOPNOTSUPP);
811 /* already get lease, handle lease lock */
812 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
813 if (it.d.lustre.it_lock_mode == 0 ||
814 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
815 /* open lock must return for lease */
816 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
817 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
818 it.d.lustre.it_lock_bits);
819 GOTO(out_close, rc = -EPROTO);
822 ll_intent_release(&it);
826 /* Cancel open lock */
827 if (it.d.lustre.it_lock_mode != 0) {
828 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
829 it.d.lustre.it_lock_mode);
830 it.d.lustre.it_lock_mode = 0;
831 och->och_lease_handle.cookie = 0ULL;
833 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
835 CERROR("%s: error closing file "DFID": %d\n",
836 ll_get_fsname(inode->i_sb, NULL, 0),
837 PFID(&ll_i2info(inode)->lli_fid), rc2);
838 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
840 ll_intent_release(&it);
848 * Release lease and close the file.
849 * It will check if the lease has ever broken.
851 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* NOTE(review): extract is missing source lines (internal numbering
 * jumps); the 'lease_broken' parameter line, LDLM_LOCK_PUT and the final
 * return are not visible here. */
854 struct ldlm_lock *lock;
855 bool cancelled = true;
859 lock = ldlm_handle2lock(&och->och_lease_handle);
861 lock_res_and_lock(lock);
862 cancelled = ldlm_is_cancel(lock);
863 unlock_res_and_lock(lock);
867 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
868 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* lease still held: cancel the lease lock ourselves before closing */
871 ldlm_cli_cancel(&och->och_lease_handle, 0);
872 if (lease_broken != NULL)
873 *lease_broken = cancelled;
875 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
/*
 * Merge MDS-provided timestamps cached in lli with OST attributes fetched
 * via cl_object_attr_get(): take the newest of each timestamp and adopt
 * the OST size/blocks, all under the inode size lock.
 * NOTE(review): extract is missing source lines (internal numbering
 * jumps); the atime/mtime/ctime declarations and the final return are not
 * visible here.
 */
880 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
882 struct ll_inode_info *lli = ll_i2info(inode);
883 struct cl_object *obj = lli->lli_clob;
884 struct cl_attr *attr = vvp_env_thread_attr(env);
892 ll_inode_size_lock(inode);
894 /* merge timestamps the most recently obtained from mds with
895 timestamps obtained from osts */
896 LTIME_S(inode->i_atime) = lli->lli_atime;
897 LTIME_S(inode->i_mtime) = lli->lli_mtime;
898 LTIME_S(inode->i_ctime) = lli->lli_ctime;
900 atime = LTIME_S(inode->i_atime);
901 mtime = LTIME_S(inode->i_mtime);
902 ctime = LTIME_S(inode->i_ctime);
904 cl_object_attr_lock(obj);
905 rc = cl_object_attr_get(env, obj, attr);
906 cl_object_attr_unlock(obj);
909 GOTO(out_size_unlock, rc);
/* keep whichever side (MDS cache or OST) has the newer timestamp */
911 if (atime < attr->cat_atime)
912 atime = attr->cat_atime;
914 if (ctime < attr->cat_ctime)
915 ctime = attr->cat_ctime;
917 if (mtime < attr->cat_mtime)
918 mtime = attr->cat_mtime;
920 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
921 PFID(&lli->lli_fid), attr->cat_size);
923 i_size_write(inode, attr->cat_size);
924 inode->i_blocks = attr->cat_blocks;
926 LTIME_S(inode->i_atime) = atime;
927 LTIME_S(inode->i_mtime) = mtime;
928 LTIME_S(inode->i_ctime) = ctime;
931 ll_inode_size_unlock(inode);
/*
 * Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, mount flags and superblock flags in turn.
 * NOTE(review): extract is missing source lines (the 'return true' /
 * 'return false' statements between the checks are not visible here).
 */
936 static bool file_is_noatime(const struct file *file)
938 const struct vfsmount *mnt = file->f_path.mnt;
939 const struct inode *inode = file->f_path.dentry->d_inode;
941 /* Adapted from file_accessed() and touch_atime().*/
942 if (file->f_flags & O_NOATIME)
945 if (inode->i_flags & S_NOATIME)
948 if (IS_NOATIME(inode))
951 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
954 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
957 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for @file: nonblock/append/sync flags from f_flags,
 * the cl object, the lock-request policy (never for nolock files,
 * mandatory for O_APPEND, maybe otherwise) and the noatime hint.
 * NOTE(review): extract is missing source lines (the 'write' branch guard
 * and part of the wr_sync expression are not visible here).
 */
963 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
965 struct inode *inode = file->f_dentry->d_inode;
967 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
969 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
970 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
971 file->f_flags & O_DIRECT ||
974 io->ci_obj = ll_i2info(inode)->lli_clob;
975 io->ci_lockreq = CILR_MAYBE;
976 if (ll_file_nolock(file)) {
977 io->ci_lockreq = CILR_NEVER;
978 io->ci_no_srvlock = 1;
979 } else if (file->f_flags & O_APPEND) {
980 io->ci_lockreq = CILR_MANDATORY;
983 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up the cl_io, enforce the Lustre maximum
 * file size, take the per-file write range lock (whole file for O_APPEND)
 * and lli_trunc_sem for normal IO, run cl_io_loop(), then handle restart,
 * position update, stats and fd_write_failed bookkeeping.
 * NOTE(review): extract is missing many source lines (internal numbering
 * jumps); the return type, 'io'/'end'/'result' declarations, case labels
 * and GOTO targets are not visible here.
 */
987 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
988 struct file *file, enum cl_io_type iot,
989 loff_t *ppos, size_t count)
991 struct inode *inode = file->f_dentry->d_inode;
992 struct ll_inode_info *lli = ll_i2info(inode);
994 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
997 struct range_lock range;
1000 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zu\n",
1001 file->f_dentry->d_name.name, iot, *ppos, count);
1004 io = vvp_env_thread_io(env);
1005 ll_io_init(io, file, iot == CIT_WRITE);
1007 /* The maximum Lustre file size is variable, based on the
1008 * OST maximum object size and number of stripes. This
1009 * needs another check in addition to the VFS checks earlier. */
1010 end = (io->u.ci_wr.wr_append ? i_size_read(inode) : *ppos) + count;
1011 if (end > ll_file_maxbytes(inode)) {
1013 CDEBUG(D_INODE, "%s: file "DFID" offset %llu > maxbytes "LPU64
1014 ": rc = %zd\n", ll_get_fsname(inode->i_sb, NULL, 0),
1015 PFID(&lli->lli_fid), end, ll_file_maxbytes(inode),
1020 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1021 struct vvp_io *vio = vvp_env_io(env);
1022 bool range_locked = false;
/* O_APPEND writes must serialize against the whole file */
1024 if (file->f_flags & O_APPEND)
1025 range_lock_init(&range, 0, LUSTRE_EOF);
1027 range_lock_init(&range, *ppos, *ppos + count - 1);
1029 vio->vui_fd = LUSTRE_FPRIVATE(file);
1030 vio->vui_io_subtype = args->via_io_subtype;
1032 switch (vio->vui_io_subtype) {
1034 vio->vui_iov = args->u.normal.via_iov;
1035 vio->vui_nrsegs = args->u.normal.via_nrsegs;
1036 vio->vui_tot_nrsegs = vio->vui_nrsegs;
1037 vio->vui_iocb = args->u.normal.via_iocb;
/* group-locked writes skip the range lock */
1038 if ((iot == CIT_WRITE) &&
1039 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1040 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1042 result = range_lock(&lli->lli_write_tree,
1047 range_locked = true;
1049 down_read(&lli->lli_trunc_sem);
1052 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1053 vio->u.splice.vui_flags = args->u.splice.via_flags;
1056 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1060 ll_cl_add(file, env, io);
1061 result = cl_io_loop(env, io);
1062 ll_cl_remove(file, env);
1064 if (args->via_io_subtype == IO_NORMAL)
1065 up_read(&lli->lli_trunc_sem);
1067 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1069 range_unlock(&lli->lli_write_tree, &range);
1072 /* cl_io_rw_init() handled IO */
1073 result = io->ci_result;
1076 if (io->ci_nob > 0) {
1077 result = io->ci_nob;
1078 *ppos = io->u.ci_wr.wr.crw_pos;
1082 cl_io_fini(env, io);
1083 /* If any bit been read/written (result != 0), we just return
1084 * short read/write instead of restart io. */
1085 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1086 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zu\n",
1087 iot == CIT_READ ? "read" : "write",
1088 file->f_dentry->d_name.name, *ppos, count);
1089 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1093 if (iot == CIT_READ) {
1095 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1096 LPROC_LL_READ_BYTES, result);
1097 } else if (iot == CIT_WRITE) {
1099 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1100 LPROC_LL_WRITE_BYTES, result);
1101 fd->fd_write_failed = false;
1102 } else if (result != -ERESTARTSYS) {
1103 fd->fd_write_failed = true;
1106 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1113 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1115 static int ll_file_get_iov_count(const struct iovec *iov,
1116 unsigned long *nr_segs, size_t *count)
/* NOTE(review): extract is missing source lines ('cnt' declaration,
 * the cnt accumulation, truncation of *nr_segs and returns are not
 * visible here). */
1121 for (seg = 0; seg < *nr_segs; seg++) {
1122 const struct iovec *iv = &iov[seg];
1125 * If any segment has a negative length, or the cumulative
1126 * length ever wraps negative then return -EINVAL.
1129 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1131 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1136 cnt -= iv->iov_len; /* This segment is no good */
/*
 * aio read entry point: validate the iovec, grab a cl environment, pack
 * the iovec/iocb into vvp_io_args and run ll_file_io_generic(CIT_READ).
 * NOTE(review): extract is missing source lines ('env'/'count'/'refcheck'
 * declarations, error checks and the final RETURN are not visible here).
 */
1143 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1144 unsigned long nr_segs, loff_t pos)
1147 struct vvp_io_args *args;
1153 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1157 env = cl_env_get(&refcheck);
1159 RETURN(PTR_ERR(env));
1161 args = ll_env_args(env, IO_NORMAL);
1162 args->u.normal.via_iov = (struct iovec *)iov;
1163 args->u.normal.via_nrsegs = nr_segs;
1164 args->u.normal.via_iocb = iocb;
1166 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1167 &iocb->ki_pos, count);
1168 cl_env_put(env, &refcheck);
/*
 * Synchronous read(): wrap the user buffer in a one-segment iovec and a
 * sync kiocb kept in the cl-env thread info, then delegate to
 * ll_file_aio_read() and propagate the updated position to *ppos.
 * NOTE(review): extract is missing source lines ('env'/'result'/'refcheck'
 * declarations, the #else/#endif around ki_left/ki_nbytes and the final
 * RETURN are not visible here).
 */
1172 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1176 struct iovec *local_iov;
1177 struct kiocb *kiocb;
1182 env = cl_env_get(&refcheck);
1184 RETURN(PTR_ERR(env));
1186 local_iov = &ll_env_info(env)->lti_local_iov;
1187 kiocb = &ll_env_info(env)->lti_kiocb;
1188 local_iov->iov_base = (void __user *)buf;
1189 local_iov->iov_len = count;
1190 init_sync_kiocb(kiocb, file);
1191 kiocb->ki_pos = *ppos;
1192 #ifdef HAVE_KIOCB_KI_LEFT
1193 kiocb->ki_left = count;
1195 kiocb->ki_nbytes = count;
1198 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1199 *ppos = kiocb->ki_pos;
1201 cl_env_put(env, &refcheck);
1206 * Write to a file (through the page cache).
/* aio write entry point; mirrors ll_file_aio_read() but runs CIT_WRITE.
 * NOTE(review): extract is missing source lines (declarations, error
 * checks and the final RETURN are not visible here). */
1209 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1210 unsigned long nr_segs, loff_t pos)
1213 struct vvp_io_args *args;
1219 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1223 env = cl_env_get(&refcheck);
1225 RETURN(PTR_ERR(env));
1227 args = ll_env_args(env, IO_NORMAL);
1228 args->u.normal.via_iov = (struct iovec *)iov;
1229 args->u.normal.via_nrsegs = nr_segs;
1230 args->u.normal.via_iocb = iocb;
1232 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1233 &iocb->ki_pos, count);
1234 cl_env_put(env, &refcheck);
/*
 * Synchronous write(): one-segment iovec plus sync kiocb from the cl-env
 * thread info, delegated to ll_file_aio_write(); mirrors ll_file_read().
 * NOTE(review): extract is missing source lines (declarations, the
 * #else/#endif around ki_left/ki_nbytes and the final RETURN are not
 * visible here).
 */
1238 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1239 size_t count, loff_t *ppos)
1242 struct iovec *local_iov;
1243 struct kiocb *kiocb;
1248 env = cl_env_get(&refcheck);
1250 RETURN(PTR_ERR(env));
1252 local_iov = &ll_env_info(env)->lti_local_iov;
1253 kiocb = &ll_env_info(env)->lti_kiocb;
1254 local_iov->iov_base = (void __user *)buf;
1255 local_iov->iov_len = count;
1256 init_sync_kiocb(kiocb, file);
1257 kiocb->ki_pos = *ppos;
1258 #ifdef HAVE_KIOCB_KI_LEFT
1259 kiocb->ki_left = count;
1261 kiocb->ki_nbytes = count;
1264 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1265 *ppos = kiocb->ki_pos;
1267 cl_env_put(env, &refcheck);
1272 * Send file content (through pagecache) somewhere with helper
/* splice-read entry point: pack pipe/flags into IO_SPLICE vvp_io_args and
 * run ll_file_io_generic(CIT_READ).
 * NOTE(review): extract is missing source lines (the 'flags' parameter
 * line, declarations and the final RETURN are not visible here). */
1274 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1275 struct pipe_inode_info *pipe, size_t count,
1279 struct vvp_io_args *args;
1284 env = cl_env_get(&refcheck);
1286 RETURN(PTR_ERR(env));
1288 args = ll_env_args(env, IO_SPLICE);
1289 args->u.splice.via_pipe = pipe;
1290 args->u.splice.via_flags = flags;
1292 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1293 cl_env_put(env, &refcheck);
/*
 * Set striping (LOV EA) on a file by performing an intent-open by FID on
 * the MDS with the user-supplied lov_user_md, then immediately releasing
 * the resulting open handle.  Runs under the inode size lock.
 * NOTE(review): elided listing — the out_unlock label and final RETURN
 * are not visible in this view.
 */
1297 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1298 __u64 flags, struct lov_user_md *lum,
1301 struct lookup_intent oit = {
1303 .it_flags = flags | MDS_OPEN_BY_FID,
1308 ll_inode_size_lock(inode);
1309 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1311 GOTO(out_unlock, rc);
/* the open was only needed to install the EA; close the handle now */
1313 ll_release_openhandle(file->f_dentry, &oit);
1316 ll_inode_size_unlock(inode);
1317 ll_intent_release(&oit);
1318 cl_lov_delay_create_clear(&file->f_flags);
/*
 * Fetch the LOV EA (striping info) of @filename from the MDS via
 * md_getattr_name(), validate its magic, and byte-swap it to host
 * endianness for userspace on big-endian hosts.  On success *lmmp points
 * into the reply buffer of *request, which the caller must release.
 */
1323 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1324 struct lov_mds_md **lmmp, int *lmm_size,
1325 struct ptlrpc_request **request)
1327 struct ll_sb_info *sbi = ll_i2sbi(inode);
1328 struct mdt_body *body;
1329 struct lov_mds_md *lmm = NULL;
1330 struct ptlrpc_request *req = NULL;
1331 struct md_op_data *op_data;
1334 rc = ll_get_default_mdsize(sbi, &lmmsize);
1338 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1339 strlen(filename), lmmsize,
1340 LUSTRE_OPC_ANY, NULL);
1341 if (IS_ERR(op_data))
1342 RETURN(PTR_ERR(op_data));
1344 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1345 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1346 ll_finish_md_op_data(op_data);
1348 CDEBUG(D_INFO, "md_getattr_name failed "
1349 "on %s: rc %d\n", filename, rc);
1353 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1354 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1356 lmmsize = body->mbo_eadatasize;
/* no EA returned (or zero-size) means the file has no striping data */
1358 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1360 GOTO(out, rc = -ENODATA);
1363 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1364 LASSERT(lmm != NULL);
/* only V1/V3 LOV magics are understood here */
1366 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1367 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1368 GOTO(out, rc = -EPROTO);
1372 * This is coming from the MDS, so is probably in
1373 * little endian. We convert it to host endian before
1374 * passing it to userspace.
/* true only on big-endian hosts: then the wire format needs swabbing */
1376 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1379 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1380 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1383 /* if function called for directory - we should
1384 * avoid swab not existent lsm objects */
1385 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1386 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1387 if (S_ISREG(body->mbo_mode))
1388 lustre_swab_lov_user_md_objects(
1389 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1391 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1392 lustre_swab_lov_user_md_v3(
1393 (struct lov_user_md_v3 *)lmm);
1394 if (S_ISREG(body->mbo_mode))
1395 lustre_swab_lov_user_md_objects(
1396 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1403 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST entry) from
 * userspace and apply it via ll_lov_setstripe_ea_info().  Requires
 * CAP_SYS_ADMIN since MDS_OPEN_HAS_OBJS trusts the caller-supplied
 * object list.
 */
1408 static int ll_lov_setea(struct inode *inode, struct file *file,
1411 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1412 struct lov_user_md *lump;
1413 int lum_size = sizeof(struct lov_user_md) +
1414 sizeof(struct lov_user_ost_data);
1418 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1421 OBD_ALLOC_LARGE(lump, lum_size);
1425 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1426 OBD_FREE_LARGE(lump, lum_size);
1430 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1432 OBD_FREE_LARGE(lump, lum_size);
/*
 * Return the file's striping info directly to the userspace buffer @lum
 * by querying the cl_object layer (no MDS round-trip here).
 */
1436 static int ll_file_getstripe(struct inode *inode,
1437 struct lov_user_md __user *lum)
1444 env = cl_env_get(&refcheck);
1446 RETURN(PTR_ERR(env));
1448 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
1449 cl_env_put(env, &refcheck);
1453 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1456 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1457 struct lov_user_md *klum;
1459 __u64 flags = FMODE_WRITE;
1462 rc = ll_copy_user_md(lum, &klum);
1467 rc = ll_lov_setstripe_ea_info(inode, file, flags, klum, lum_size);
1471 put_user(0, &lum->lmm_stripe_count);
1473 ll_layout_refresh(inode, &gen);
1474 rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
1477 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK: take a group lock with gid @arg on the file.
 * Only one group lock per file descriptor; fd_flags/fd_grouplock are
 * protected by lli->lli_lock, and the race of two threads acquiring
 * concurrently is resolved after cl_get_grouplock() returns.
 */
1482 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1484 struct ll_inode_info *lli = ll_i2info(inode);
1485 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1486 struct ll_grouplock grouplock;
/* gid 0 is reserved as "no group lock" */
1491 CWARN("group id for group lock must not be 0\n");
1495 if (ll_file_nolock(file))
1496 RETURN(-EOPNOTSUPP);
1498 spin_lock(&lli->lli_lock);
1499 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1500 CWARN("group lock already existed with gid %lu\n",
1501 fd->fd_grouplock.lg_gid);
1502 spin_unlock(&lli->lli_lock);
1505 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1506 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK; done outside the spinlock */
1508 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
1509 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1513 spin_lock(&lli->lli_lock);
/* re-check: another thread may have installed a lock meanwhile */
1514 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1515 spin_unlock(&lli->lli_lock);
1516 CERROR("another thread just won the race\n");
1517 cl_put_grouplock(&grouplock);
1521 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1522 fd->fd_grouplock = grouplock;
1523 spin_unlock(&lli->lli_lock);
1525 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with gid @arg previously
 * taken on this file descriptor.  State is detached under lli_lock and
 * the cl-layer lock is dropped afterwards, outside the spinlock.
 */
1529 static int ll_put_grouplock(struct inode *inode, struct file *file,
1532 struct ll_inode_info *lli = ll_i2info(inode);
1533 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1534 struct ll_grouplock grouplock;
1537 spin_lock(&lli->lli_lock);
1538 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1539 spin_unlock(&lli->lli_lock);
1540 CWARN("no group lock held\n");
1544 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* gid must match the one the lock was taken with */
1546 if (fd->fd_grouplock.lg_gid != arg) {
1547 CWARN("group lock %lu doesn't match current id %lu\n",
1548 arg, fd->fd_grouplock.lg_gid);
1549 spin_unlock(&lli->lli_lock);
/* take a local copy so the release can happen after unlocking */
1553 grouplock = fd->fd_grouplock;
1554 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1555 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1556 spin_unlock(&lli->lli_lock);
1558 cl_put_grouplock(&grouplock);
1559 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1564 * Close inode open handle
1566 * \param dentry [in] dentry which contains the inode
1567 * \param it [in,out] intent which contains open info and result
1570 * \retval <0 failure
/*
 * Close the MDS open handle carried by intent @it (if any) for
 * @dentry's inode, and drop the DISP_ENQ_OPEN_REF request reference
 * that stood in for a real ll_file_open().
 */
1572 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1574 struct inode *inode = dentry->d_inode;
1575 struct obd_client_handle *och;
1581 /* Root ? Do nothing. */
1582 if (dentry->d_inode->i_sb->s_root == dentry)
1585 /* No open handle to close? Move away */
1586 if (!it_disposition(it, DISP_OPEN_OPEN))
1589 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1591 OBD_ALLOC(och, sizeof(*och));
1593 GOTO(out, rc = -ENOMEM);
/* fill the client handle from the intent's open reply */
1595 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1597 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1600 /* this one is in place of ll_file_open */
1601 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1602 ptlrpc_req_finished(it->d.lustre.it_data);
1603 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1609 * Get size for inode for which FIEMAP mapping is requested.
1610 * Make the FIEMAP get_info call and returns the result.
1611 * \param fiemap kernel buffer to hold extens
1612 * \param num_bytes kernel buffer size
/*
 * Core FIEMAP implementation: validate flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), make sure the size is known (glimpse when
 * i_size reads 0), then ask the cl_object layer to fill @fiemap.
 */
1614 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
1620 struct ll_fiemap_info_key fmkey = { .name = KEY_FIEMAP, };
1623 /* Checks for fiemap flags */
/* unsupported flags are reported back to the caller by masking them in */
1624 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1625 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1629 /* Check for FIEMAP_FLAG_SYNC */
1630 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1631 rc = filemap_fdatawrite(inode->i_mapping);
1636 env = cl_env_get(&refcheck);
1638 RETURN(PTR_ERR(env));
/* size may be stale/unknown; glimpse from OSTs before mapping */
1640 if (i_size_read(inode) == 0) {
1641 rc = ll_glimpse_size(inode);
1646 fmkey.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1647 obdo_from_inode(&fmkey.oa, inode, OBD_MD_FLSIZE);
1648 obdo_set_parent_fid(&fmkey.oa, &ll_i2info(inode)->lli_fid);
1650 /* If filesize is 0, then there would be no objects for mapping */
1651 if (fmkey.oa.o_size == 0) {
1652 fiemap->fm_mapped_extents = 0;
1656 fmkey.fiemap = *fiemap;
1658 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
1659 &fmkey, fiemap, &num_bytes);
1661 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a pathname via the MDC.  The user
 * supplies a getinfo_fid2path with gf_pathlen sizing the output buffer;
 * the whole (header + path) buffer is copied back on success.
 */
1665 int ll_fid2path(struct inode *inode, void __user *arg)
1667 struct obd_export *exp = ll_i2mdexp(inode);
1668 const struct getinfo_fid2path __user *gfin = arg;
1670 struct getinfo_fid2path *gfout;
/* gated: needs DAC_READ_SEARCH unless the mount allows user fid2path */
1676 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1677 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1680 /* Only need to get the buflen */
1681 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation by the user-provided length */
1684 if (pathlen > PATH_MAX)
1687 outsize = sizeof(*gfout) + pathlen;
1688 OBD_ALLOC(gfout, outsize);
1692 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1693 GOTO(gf_free, rc = -EFAULT);
1695 /* Call mdc_iocontrol */
1696 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1700 if (copy_to_user(arg, gfout, outsize))
1704 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user-requested extent count (with overflow check), copy in the header
 * (and first extent, used by the mapping code as a continuation cursor),
 * run ll_do_fiemap(), and copy header plus mapped extents back out.
 */
1708 static int ll_ioctl_fiemap(struct inode *inode, struct fiemap __user *arg)
1710 struct fiemap *fiemap;
1716 /* Get the extent count so we can calculate the size of
1717 * required fiemap buffer */
1718 if (get_user(extent_count, &arg->fm_extent_count))
/* reject counts whose buffer size would overflow size_t */
1722 (SIZE_MAX - sizeof(*fiemap)) / sizeof(struct ll_fiemap_extent))
1724 num_bytes = sizeof(*fiemap) + (extent_count *
1725 sizeof(struct ll_fiemap_extent));
1727 OBD_ALLOC_LARGE(fiemap, num_bytes);
1731 /* get the fiemap value */
1732 if (copy_from_user(fiemap, arg, sizeof(*fiemap)))
1733 GOTO(error, rc = -EFAULT);
1735 /* If fm_extent_count is non-zero, read the first extent since
1736 * it is used to calculate end_offset and device from previous
1738 if (extent_count != 0) {
1739 if (copy_from_user(&fiemap->fm_extents[0],
1740 (char __user *)arg + sizeof(*fiemap),
1741 sizeof(struct ll_fiemap_extent)))
1742 GOTO(error, rc = -EFAULT);
1745 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* copy back only the header plus extents actually mapped */
1749 ret_bytes = sizeof(struct fiemap);
1751 if (extent_count != 0)
1752 ret_bytes += (fiemap->fm_mapped_extents *
1753 sizeof(struct ll_fiemap_extent));
1755 if (copy_to_user((void __user *)arg, fiemap, ret_bytes))
1759 OBD_FREE_LARGE(fiemap, num_bytes);
1764 * Read the data_version for inode.
1766 * This value is computed using stripe object version on OST.
1767 * Version is computed using server side locking.
1769 * @param flags if do sync on the OST side;
1771 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
1772 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Compute the file's data_version from its stripe objects on the OSTs.
 * @flags selects OST-side flushing (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH).
 * A file with no cl_object yet is defined to have version 0.
 */
1774 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
1781 /* If no file object initialized, we consider its version is 0. */
1782 if (ll_i2info(inode)->lli_clob == NULL) {
1787 env = cl_env_get(&refcheck);
1789 RETURN(PTR_ERR(env));
1791 rc = cl_object_data_version(env, ll_i2info(inode)->lli_clob,
1792 data_version, flags);
1793 cl_env_put(env, &refcheck);
1798 * Trigger a HSM release request for the provided inode.
/*
 * Trigger a HSM release: take a write lease with MDS_OPEN_RELEASE,
 * flush and grab the latest data_version, merge [am]time attributes,
 * then close the open handle in "release" mode so the MDT can punch
 * the file's OST objects.  The lease is closed on any failure.
 */
1800 int ll_hsm_release(struct inode *inode)
1802 struct cl_env_nest nest;
1804 struct obd_client_handle *och = NULL;
1805 __u64 data_version = 0;
1809 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1810 ll_get_fsname(inode->i_sb, NULL, 0),
1811 PFID(&ll_i2info(inode)->lli_fid));
1813 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1815 GOTO(out, rc = PTR_ERR(och));
1817 /* Grab latest data_version and [am]time values */
1818 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
1822 env = cl_env_nested_get(&nest);
1824 GOTO(out, rc = PTR_ERR(env));
1826 ll_merge_attr(env, inode);
1827 cl_env_nested_put(&nest, env);
1829 /* Release the file.
1830 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1831 * we still need it to pack l_remote_handle to MDT. */
1832 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1838 if (och != NULL && !IS_ERR(och)) /* close the file */
1839 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): saved [am]time attrs, the two
 * inodes (ordered by FID), and per-side data_version check flags.
 * Heap-allocated to keep it off the kernel stack.
 */
1844 struct ll_swap_stack {
1845 struct iattr ia1, ia2;
1847 struct inode *inode1, *inode2;
1848 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically swap the layouts of two regular
 * files on the same filesystem.  Steps: permission/sanity checks,
 * FID-order the pair (lock-ordering), optionally take a shared group
 * lock to flush caches, verify data_versions if requested, send the
 * swap to the MDT, then restore mtime/atime per the KEEP_* flags.
 */
1851 static int ll_swap_layouts(struct file *file1, struct file *file2,
1852 struct lustre_swap_layouts *lsl)
1854 struct mdc_swap_layouts msl;
1855 struct md_op_data *op_data;
1858 struct ll_swap_stack *llss = NULL;
1861 OBD_ALLOC_PTR(llss);
1865 llss->inode1 = file1->f_dentry->d_inode;
1866 llss->inode2 = file2->f_dentry->d_inode;
1868 if (!S_ISREG(llss->inode2->i_mode))
1869 GOTO(free, rc = -EINVAL);
1871 if (inode_permission(llss->inode1, MAY_WRITE) ||
1872 inode_permission(llss->inode2, MAY_WRITE))
1873 GOTO(free, rc = -EPERM);
1875 if (llss->inode2->i_sb != llss->inode1->i_sb)
1876 GOTO(free, rc = -EXDEV);
1878 /* we use 2 bool because it is easier to swap than 2 bits */
1879 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1880 llss->check_dv1 = true;
1882 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1883 llss->check_dv2 = true;
1885 /* we cannot use lsl->sl_dvX directly because we may swap them */
1886 llss->dv1 = lsl->sl_dv1;
1887 llss->dv2 = lsl->sl_dv2;
1889 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1890 if (rc == 0) /* same file, done! */
/* canonical FID order prevents AB/BA deadlock between two swappers */
1893 if (rc < 0) { /* sequentialize it */
1894 swap(llss->inode1, llss->inode2);
1896 swap(llss->dv1, llss->dv2);
1897 swap(llss->check_dv1, llss->check_dv2);
1901 if (gid != 0) { /* application asks to flush dirty cache */
1902 rc = ll_get_grouplock(llss->inode1, file1, gid);
1906 rc = ll_get_grouplock(llss->inode2, file2, gid);
1908 ll_put_grouplock(llss->inode1, file1, gid);
1913 /* to be able to restore mtime and atime after swap
1914 * we need to first save them */
1916 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1917 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1918 llss->ia1.ia_atime = llss->inode1->i_atime;
1919 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1920 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1921 llss->ia2.ia_atime = llss->inode2->i_atime;
1922 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1925 /* ultimate check, before swaping the layouts we check if
1926 * dataversion has changed (if requested) */
1927 if (llss->check_dv1) {
1928 rc = ll_data_version(llss->inode1, &dv, 0);
1931 if (dv != llss->dv1)
1932 GOTO(putgl, rc = -EAGAIN);
1935 if (llss->check_dv2) {
1936 rc = ll_data_version(llss->inode2, &dv, 0);
1939 if (dv != llss->dv2)
1940 GOTO(putgl, rc = -EAGAIN);
1943 /* struct md_op_data is used to send the swap args to the mdt
1944 * only flags is missing, so we use struct mdc_swap_layouts
1945 * through the md_op_data->op_data */
1946 /* flags from user space have to be converted before they are send to
1947 * server, no flag is sent today, they are only used on the client */
1950 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1951 0, LUSTRE_OPC_ANY, &msl);
1952 if (IS_ERR(op_data))
1953 GOTO(free, rc = PTR_ERR(op_data));
1955 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1956 sizeof(*op_data), op_data, NULL);
1957 ll_finish_md_op_data(op_data);
1961 ll_put_grouplock(llss->inode2, file2, gid);
1962 ll_put_grouplock(llss->inode1, file1, gid);
1965 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1969 /* clear useless flags */
1970 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1971 llss->ia1.ia_valid &= ~ATTR_MTIME;
1972 llss->ia2.ia_valid &= ~ATTR_MTIME;
1975 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1976 llss->ia1.ia_valid &= ~ATTR_ATIME;
1977 llss->ia2.ia_valid &= ~ATTR_ATIME;
1980 /* update time if requested */
/* note the crossed restore: file1 receives ia2 (its pre-swap times
 * ended up tracked on the swapped side), and vice versa below */
1982 if (llss->ia2.ia_valid != 0) {
1983 mutex_lock(&llss->inode1->i_mutex);
1984 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1985 mutex_unlock(&llss->inode1->i_mutex);
1988 if (llss->ia1.ia_valid != 0) {
1991 mutex_lock(&llss->inode2->i_mutex);
1992 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1993 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode via the MDT.  Validates the masks
 * against HSM_FLAGS_MASK, restricts non-HSM_USER_MASK bits to
 * CAP_SYS_ADMIN, and range-checks the archive id.
 */
2005 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2007 struct md_op_data *op_data;
2011 /* Detect out-of range masks */
2012 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2015 /* Non-root users are forbidden to set or clear flags which are
2016 * NOT defined in HSM_USER_MASK. */
2017 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2018 !cfs_capable(CFS_CAP_SYS_ADMIN))
2021 /* Detect out-of range archive id */
2022 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2023 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2026 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2027 LUSTRE_OPC_ANY, hss);
2028 if (IS_ERR(op_data))
2029 RETURN(PTR_ERR(op_data));
2031 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2032 sizeof(*op_data), op_data, NULL);
2034 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT: register a pre-existing archived copy.  Marks the
 * file HS_ARCHIVED|HS_EXISTS|HS_RELEASED on the MDT, then applies the
 * user-supplied mode/uid/gid/size/times via ll_setattr_raw() under the
 * inode mutex so the inode reflects the archived object's metadata.
 */
2039 static int ll_hsm_import(struct inode *inode, struct file *file,
2040 struct hsm_user_import *hui)
2042 struct hsm_state_set *hss = NULL;
2043 struct iattr *attr = NULL;
/* import only makes sense for regular files */
2047 if (!S_ISREG(inode->i_mode))
2053 GOTO(out, rc = -ENOMEM);
2055 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2056 hss->hss_archive_id = hui->hui_archive_id;
2057 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2058 rc = ll_hsm_state_set(inode, hss);
2062 OBD_ALLOC_PTR(attr);
2064 GOTO(out, rc = -ENOMEM);
2066 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2067 attr->ia_mode |= S_IFREG;
2068 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2069 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2070 attr->ia_size = hui->hui_size;
2071 attr->ia_mtime.tv_sec = hui->hui_mtime;
2072 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2073 attr->ia_atime.tv_sec = hui->hui_atime;
2074 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2076 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2077 ATTR_UID | ATTR_GID |
2078 ATTR_MTIME | ATTR_MTIME_SET |
2079 ATTR_ATIME | ATTR_ATIME_SET;
2081 mutex_lock(&inode->i_mutex);
2083 rc = ll_setattr_raw(file->f_dentry, attr, true);
2087 mutex_unlock(&inode->i_mutex);
/* Translate an open fmode (FMODE_READ/FMODE_WRITE bits) into the
 * LL_LEASE_RDLCK/LL_LEASE_WRLCK lease-type bitmask returned to userspace. */
2099 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2101 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2102 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * unlocked_ioctl entry for regular files: dispatch Lustre-private and
 * fs-compat ioctls.  Commands not handled here fall through first to
 * registered ll_iocontrol_call() handlers, then to the data (OST)
 * export's obd_iocontrol().
 * NOTE(review): elided listing — many case labels, RETURN/break lines
 * and local declarations are not visible in this view.
 */
2106 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2108 struct inode *inode = file->f_dentry->d_inode;
2109 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2113 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2114 PFID(ll_inode2fid(inode)), inode, cmd);
2115 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2117 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2118 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2122 case LL_IOC_GETFLAGS:
2123 /* Get the current value of the file flags */
2124 return put_user(fd->fd_flags, (int __user *)arg);
2125 case LL_IOC_SETFLAGS:
2126 case LL_IOC_CLRFLAGS:
2127 /* Set or clear specific file flags */
2128 /* XXX This probably needs checks to ensure the flags are
2129 * not abused, and to handle any flag side effects.
2131 if (get_user(flags, (int __user *) arg))
2134 if (cmd == LL_IOC_SETFLAGS) {
2135 if ((flags & LL_FILE_IGNORE_LOCK) &&
2136 !(file->f_flags & O_DIRECT)) {
2137 CERROR("%s: unable to disable locking on "
2138 "non-O_DIRECT file\n", current->comm);
2142 fd->fd_flags |= flags;
2144 fd->fd_flags &= ~flags;
2147 case LL_IOC_LOV_SETSTRIPE:
2148 RETURN(ll_lov_setstripe(inode, file, arg));
2149 case LL_IOC_LOV_SETEA:
2150 RETURN(ll_lov_setea(inode, file, arg));
2151 case LL_IOC_LOV_SWAP_LAYOUTS: {
2153 struct lustre_swap_layouts lsl;
2155 if (copy_from_user(&lsl, (char __user *)arg,
2156 sizeof(struct lustre_swap_layouts)))
/* both files must be open for writing to swap layouts */
2159 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2162 file2 = fget(lsl.sl_fd);
2167 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2168 rc = ll_swap_layouts(file, file2, &lsl);
2172 case LL_IOC_LOV_GETSTRIPE:
2173 RETURN(ll_file_getstripe(inode,
2174 (struct lov_user_md __user *)arg));
2175 case FSFILT_IOC_FIEMAP:
2176 RETURN(ll_ioctl_fiemap(inode, (struct fiemap __user *)arg));
2177 case FSFILT_IOC_GETFLAGS:
2178 case FSFILT_IOC_SETFLAGS:
2179 RETURN(ll_iocontrol(inode, file, cmd, arg));
2180 case FSFILT_IOC_GETVERSION_OLD:
2181 case FSFILT_IOC_GETVERSION:
2182 RETURN(put_user(inode->i_generation, (int __user *)arg));
2183 case LL_IOC_GROUP_LOCK:
2184 RETURN(ll_get_grouplock(inode, file, arg));
2185 case LL_IOC_GROUP_UNLOCK:
2186 RETURN(ll_put_grouplock(inode, file, arg));
2187 case IOC_OBD_STATFS:
2188 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2190 /* We need to special case any other ioctls we want to handle,
2191 * to send them to the MDS/OST as appropriate and to properly
2192 * network encode the arg field.
2193 case FSFILT_IOC_SETVERSION_OLD:
2194 case FSFILT_IOC_SETVERSION:
2196 case LL_IOC_FLUSHCTX:
2197 RETURN(ll_flush_ctx(inode));
2198 case LL_IOC_PATH2FID: {
2199 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2200 sizeof(struct lu_fid)))
2205 case LL_IOC_GETPARENT:
2206 RETURN(ll_getparent(file, (struct getparent __user *)arg));
2208 case OBD_IOC_FID2PATH:
2209 RETURN(ll_fid2path(inode, (void __user *)arg));
2210 case LL_IOC_DATA_VERSION: {
2211 struct ioc_data_version idv;
2214 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* mask user flags down to the supported flush modes */
2217 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2218 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2221 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2227 case LL_IOC_GET_MDTIDX: {
2230 mdtidx = ll_get_mdt_idx(inode);
2234 if (put_user((int)mdtidx, (int __user *)arg))
2239 case OBD_IOC_GETDTNAME:
2240 case OBD_IOC_GETMDNAME:
2241 RETURN(ll_get_obd_name(inode, cmd, arg));
2242 case LL_IOC_HSM_STATE_GET: {
2243 struct md_op_data *op_data;
2244 struct hsm_user_state *hus;
2251 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2252 LUSTRE_OPC_ANY, hus);
2253 if (IS_ERR(op_data)) {
2255 RETURN(PTR_ERR(op_data));
2258 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2261 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2264 ll_finish_md_op_data(op_data);
2268 case LL_IOC_HSM_STATE_SET: {
2269 struct hsm_state_set *hss;
2276 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2281 rc = ll_hsm_state_set(inode, hss);
2286 case LL_IOC_HSM_ACTION: {
2287 struct md_op_data *op_data;
2288 struct hsm_current_action *hca;
2295 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2296 LUSTRE_OPC_ANY, hca);
2297 if (IS_ERR(op_data)) {
2299 RETURN(PTR_ERR(op_data));
2302 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2305 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2308 ll_finish_md_op_data(op_data);
2312 case LL_IOC_SET_LEASE: {
2313 struct ll_inode_info *lli = ll_i2info(inode);
2314 struct obd_client_handle *och = NULL;
/* requested lease mode must not exceed the file's open mode */
2319 case LL_LEASE_WRLCK:
2320 if (!(file->f_mode & FMODE_WRITE))
2322 fmode = FMODE_WRITE;
2324 case LL_LEASE_RDLCK:
2325 if (!(file->f_mode & FMODE_READ))
2329 case LL_LEASE_UNLCK:
2330 mutex_lock(&lli->lli_och_mutex);
2331 if (fd->fd_lease_och != NULL) {
2332 och = fd->fd_lease_och;
2333 fd->fd_lease_och = NULL;
2335 mutex_unlock(&lli->lli_och_mutex);
2340 fmode = och->och_flags;
2341 rc = ll_lease_close(och, inode, &lease_broken);
2348 RETURN(ll_lease_type_from_fmode(fmode));
2353 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2355 /* apply for lease */
2356 och = ll_lease_open(inode, file, fmode, 0);
2358 RETURN(PTR_ERR(och));
2361 mutex_lock(&lli->lli_och_mutex);
2362 if (fd->fd_lease_och == NULL) {
2363 fd->fd_lease_och = och;
2366 mutex_unlock(&lli->lli_och_mutex);
2368 /* impossible now that only excl is supported for now */
2369 ll_lease_close(och, inode, &lease_broken);
2374 case LL_IOC_GET_LEASE: {
2375 struct ll_inode_info *lli = ll_i2info(inode);
2376 struct ldlm_lock *lock = NULL;
2379 mutex_lock(&lli->lli_och_mutex);
2380 if (fd->fd_lease_och != NULL) {
2381 struct obd_client_handle *och = fd->fd_lease_och;
2383 lock = ldlm_handle2lock(&och->och_lease_handle);
2385 lock_res_and_lock(lock);
/* a cancelled lease lock means the lease was already broken */
2386 if (!ldlm_is_cancel(lock))
2387 fmode = och->och_flags;
2389 unlock_res_and_lock(lock);
2390 LDLM_LOCK_PUT(lock);
2393 mutex_unlock(&lli->lli_och_mutex);
2395 RETURN(ll_lease_type_from_fmode(fmode));
2397 case LL_IOC_HSM_IMPORT: {
2398 struct hsm_user_import *hui;
2404 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2409 rc = ll_hsm_import(inode, file, hui);
/* fall back: dynamically registered handlers, then the OST export */
2419 ll_iocontrol_call(inode, file, cmd, arg, &err))
2422 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2423 (void __user *)arg));
2428 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Fallback llseek helper (pre-generic_file_llseek_size kernels):
 * validate @offset against sign and @maxsize, then commit it to f_pos,
 * resetting f_version only when the position actually changes.
 */
2429 static inline loff_t
2430 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2432 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2434 if (offset > maxsize)
2437 if (offset != file->f_pos) {
2438 file->f_pos = offset;
2439 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels lacking it:
 * compute the new position for each seek origin (CUR under i_mutex to
 * avoid racing concurrent seeks) and commit via llseek_execute().
 * NOTE(review): elided listing — the switch labels and several branch
 * bodies are not visible in this view.
 */
2445 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2446 loff_t maxsize, loff_t eof)
2448 struct inode *inode = file->f_dentry->d_inode;
2456 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2457 * position-querying operation. Avoid rewriting the "same"
2458 * f_pos value back to the file because a concurrent read(),
2459 * write() or lseek() might have altered it
2464 * f_lock protects against read/modify/write race with other
2465 * SEEK_CURs. Note that parallel writes and reads behave
2468 mutex_lock(&inode->i_mutex);
2469 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2470 mutex_unlock(&inode->i_mutex);
2474 * In the generic case the entire file is data, so as long as
2475 * offset isn't at the end of the file then the offset is data.
2482 * There is a virtual hole at the end of the file, so as long as
2483 * offset isn't i_size or larger, return i_size.
2491 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for origins that depend on file size (SEEK_END,
 * SEEK_HOLE, SEEK_DATA), glimpse the current size from the OSTs first,
 * then delegate to the (possibly local) generic_file_llseek_size.
 */
2495 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2497 struct inode *inode = file->f_dentry->d_inode;
2498 loff_t retval, eof = 0;
/* retval here is only the tentative target, used for the trace below */
2501 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2502 (origin == SEEK_CUR) ? file->f_pos : 0);
2503 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2504 PFID(ll_inode2fid(inode)), inode, retval, retval,
2506 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2508 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2509 retval = ll_glimpse_size(inode);
2512 eof = i_size_read(inode);
2515 retval = ll_generic_file_llseek_size(file, offset, origin,
2516 ll_file_maxbytes(inode), eof);
/*
 * flush() (called on close(2)): surface any asynchronous writeback
 * errors recorded for this inode/object — but only once per failure;
 * if the fd already reported a write failure, stay silent.
 */
2520 static int ll_flush(struct file *file, fl_owner_t id)
2522 struct inode *inode = file->f_dentry->d_inode;
2523 struct ll_inode_info *lli = ll_i2info(inode);
2524 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2527 LASSERT(!S_ISDIR(inode->i_mode));
2529 /* catch async errors that were recorded back when async writeback
2530 * failed for pages in this mapping. */
/* read-and-clear: the error is consumed by this flush */
2531 rc = lli->lli_async_rc;
2532 lli->lli_async_rc = 0;
2533 if (lli->lli_clob != NULL) {
2534 err = lov_read_and_clear_async_rc(lli->lli_clob);
2539 /* The application has been told write failure already.
2540 * Do not report failure again. */
2541 if (fd->fd_write_failed)
2543 return rc ? -EIO : 0;
2547 * Called to make sure a portion of file has been written out.
2548 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2550 * Return how many pages have been written.
/*
 * Write out [start, end] of @inode through a CIT_FSYNC cl_io.
 * @mode selects local-only flush, discard, or full OST sync;
 * @ignore_layout skips layout validity checks.  Returns the number of
 * pages written on success, negative errno on failure.
 */
2552 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2553 enum cl_fsync_mode mode, int ignore_layout)
2555 struct cl_env_nest nest;
2558 struct obd_capa *capa = NULL;
2559 struct cl_fsync_io *fio;
2563 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2564 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2567 env = cl_env_nested_get(&nest);
2569 RETURN(PTR_ERR(env));
/* OSS write capability accompanies the sync request */
2571 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2573 io = vvp_env_thread_io(env);
2574 io->ci_obj = ll_i2info(inode)->lli_clob;
2575 io->ci_ignore_layout = ignore_layout;
2577 /* initialize parameters for sync */
2578 fio = &io->u.ci_fsync;
2579 fio->fi_capa = capa;
2580 fio->fi_start = start;
2582 fio->fi_fid = ll_inode2fid(inode);
2583 fio->fi_mode = mode;
2584 fio->fi_nr_written = 0;
2586 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2587 result = cl_io_loop(env, io);
2589 result = io->ci_result;
/* on success report how many pages the fsync IO wrote */
2591 result = fio->fi_nr_written;
2592 cl_io_fini(env, io);
2593 cl_env_nested_put(&nest, env);
2601 * When dentry is provided (the 'else' case), *file->f_dentry may be
2602 * null and dentry must be used directly rather than pulled from
2603 * *file->f_dentry as is done otherwise.
2606 #ifdef HAVE_FILE_FSYNC_4ARGS
2607 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2609 struct dentry *dentry = file->f_dentry;
2610 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2611 int ll_fsync(struct file *file, int datasync)
2613 struct dentry *dentry = file->f_dentry;
2615 loff_t end = LLONG_MAX;
2617 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2620 loff_t end = LLONG_MAX;
2622 struct inode *inode = dentry->d_inode;
2623 struct ll_inode_info *lli = ll_i2info(inode);
2624 struct ptlrpc_request *req;
2625 struct obd_capa *oc;
2629 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2630 PFID(ll_inode2fid(inode)), inode);
2631 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2633 #ifdef HAVE_FILE_FSYNC_4ARGS
2634 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2635 mutex_lock(&inode->i_mutex);
2637 /* fsync's caller has already called _fdata{sync,write}, we want
2638 * that IO to finish before calling the osc and mdc sync methods */
2639 rc = filemap_fdatawait(inode->i_mapping);
2642 /* catch async errors that were recorded back when async writeback
2643 * failed for pages in this mapping. */
2644 if (!S_ISDIR(inode->i_mode)) {
2645 err = lli->lli_async_rc;
2646 lli->lli_async_rc = 0;
2649 err = lov_read_and_clear_async_rc(lli->lli_clob);
2654 oc = ll_mdscapa_get(inode);
2655 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2661 ptlrpc_req_finished(req);
2663 if (S_ISREG(inode->i_mode)) {
2664 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2666 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2667 if (rc == 0 && err < 0)
2670 fd->fd_write_failed = true;
2672 fd->fd_write_failed = false;
2675 #ifdef HAVE_FILE_FSYNC_4ARGS
2676 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): service BSD flock(2) and POSIX fcntl(2) advisory
 * locks by enqueueing an LDLM_FLOCK lock on the MDS, then mirroring the
 * result into the local VFS lock tables.
 * NOTE(review): some original source lines (switch labels, RETURN,
 * braces) are elided in this excerpt.
 */
2682 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2684 struct inode *inode = file->f_dentry->d_inode;
2685 struct ll_sb_info *sbi = ll_i2sbi(inode);
2686 struct ldlm_enqueue_info einfo = {
2687 .ei_type = LDLM_FLOCK,
2688 .ei_cb_cp = ldlm_flock_completion_ast,
2689 .ei_cbdata = file_lock,
2691 struct md_op_data *op_data;
2692 struct lustre_handle lockh = {0};
2693 ldlm_policy_data_t flock = {{0}};
/* Remember the caller's lock type; it is overwritten below and restored
 * after the enqueue (unless this is a TEST lock). */
2694 int fl_type = file_lock->fl_type;
2700 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2701 PFID(ll_inode2fid(inode)), file_lock);
2703 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2705 if (file_lock->fl_flags & FL_FLOCK) {
2706 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2707 /* flocks are whole-file locks */
2708 flock.l_flock.end = OFFSET_MAX;
2709 /* For flocks owner is determined by the local file descriptor */
2710 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2711 } else if (file_lock->fl_flags & FL_POSIX) {
2712 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2713 flock.l_flock.start = file_lock->fl_start;
2714 flock.l_flock.end = file_lock->fl_end;
2718 flock.l_flock.pid = file_lock->fl_pid;
2720 /* Somewhat ugly workaround for svc lockd.
2721 * lockd installs custom fl_lmops->lm_compare_owner that checks
2722 * for the fl_owner to be the same (which it always is on local node
2723 * I guess between lockd processes) and then compares pid.
2724 * As such we assign pid to the owner field to make it all work,
2725 * conflict with normal locks is unlikely since pid space and
2726 * pointer space for current->files are not intersecting */
2727 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2728 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map fl_type to an LDLM mode (switch case labels elided in excerpt):
 * read lock -> LCK_PR. */
2732 einfo.ei_mode = LCK_PR;
2735 /* An unlock request may or may not have any relation to
2736 * existing locks so we may not be able to pass a lock handle
2737 * via a normal ldlm_lock_cancel() request. The request may even
2738 * unlock a byte range in the middle of an existing lock. In
2739 * order to process an unlock request we need all of the same
2740 * information that is given with a normal read or write record
2741 * lock request. To avoid creating another ldlm unlock (cancel)
2742 * message we'll treat a LCK_NL flock request as an unlock. */
2743 einfo.ei_mode = LCK_NL;
/* Write lock -> LCK_PW. */
2746 einfo.ei_mode = LCK_PW;
2749 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map cmd to enqueue flags (case labels elided): non-blocking request. */
2764 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, do not acquire. */
2770 flags = LDLM_FL_TEST_LOCK;
2773 CERROR("unknown fcntl lock command: %d\n", cmd);
2777 /* Save the old mode so that if the mode in the lock changes we
2778 * can decrement the appropriate reader or writer refcount. */
2779 file_lock->fl_type = einfo.ei_mode;
2781 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2782 LUSTRE_OPC_ANY, NULL);
2783 if (IS_ERR(op_data))
2784 RETURN(PTR_ERR(op_data));
2786 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
2787 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
2788 flock.l_flock.pid, flags, einfo.ei_mode,
2789 flock.l_flock.start, flock.l_flock.end);
2791 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
2794 /* Restore the file lock type if not TEST lock. */
2795 if (!(flags & LDLM_FL_TEST_LOCK))
2796 file_lock->fl_type = fl_type;
/* On success (or unlock), replay the lock into the local VFS so the
 * kernel's lock bookkeeping matches the cluster state. */
2798 if ((file_lock->fl_flags & FL_FLOCK) &&
2799 (rc == 0 || file_lock->fl_type == F_UNLCK))
2800 rc2 = flock_lock_file_wait(file, file_lock);
2801 if ((file_lock->fl_flags & FL_POSIX) &&
2802 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2803 !(flags & LDLM_FL_TEST_LOCK))
2804 rc2 = posix_lock_file_wait(file, file_lock);
/* Local grant failed after a successful server enqueue: release the
 * server-side lock (LCK_NL == unlock) to stay consistent. */
2806 if (rc2 && file_lock->fl_type != F_UNLCK) {
2807 einfo.ei_mode = LCK_NL;
2808 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
2813 ll_finish_md_op_data(op_data);
/*
 * ll_get_fid_by_name(): resolve the FID of entry @name (length @namelen)
 * under directory @parent with an MDS getattr-by-name RPC.
 * On success *fid is filled from the reply's mdt_body.
 * Returns 0 or a negative errno.
 */
2818 int ll_get_fid_by_name(struct inode *parent, const char *name,
2819 int namelen, struct lu_fid *fid)
2821 struct md_op_data *op_data = NULL;
2822 struct mdt_body *body;
2823 struct ptlrpc_request *req;
2827 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
2828 LUSTRE_OPC_ANY, NULL);
2829 if (IS_ERR(op_data))
2830 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
2832 op_data->op_valid = OBD_MD_FLID;
2833 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
2834 ll_finish_md_op_data(op_data);
2838 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2840 GOTO(out_req, rc = -EFAULT);
2842 *fid = body->mbo_fid1;
2844 ptlrpc_req_finished(req);
/*
 * ll_migrate(): move entry @name under @parent to MDT @mdtidx using a
 * CLI_MIGRATE rename onto itself. The child inode (if cached) is locked
 * and its dentry aliases invalidated for the duration of the migration.
 * Returns 0 on success (including the no-op "already on target MDT"
 * case) or a negative errno.
 */
2848 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
2849 const char *name, int namelen)
2851 struct dentry *dchild = NULL;
2852 struct inode *child_inode = NULL;
2853 struct md_op_data *op_data;
2854 struct ptlrpc_request *request = NULL;
2859 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
2860 name, PFID(ll_inode2fid(parent)), mdtidx);
2862 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
2863 0, LUSTRE_OPC_ANY, NULL);
2864 if (IS_ERR(op_data))
2865 RETURN(PTR_ERR(op_data));
2867 /* Get child FID first */
2868 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry: it gives the FID without an extra RPC. */
2871 dchild = d_lookup(file->f_dentry, &qstr);
2872 if (dchild != NULL) {
2873 if (dchild->d_inode != NULL) {
2874 child_inode = igrab(dchild->d_inode);
2875 if (child_inode != NULL) {
2876 mutex_lock(&child_inode->i_mutex);
2877 op_data->op_fid3 = *ll_inode2fid(child_inode);
2878 ll_invalidate_aliases(child_inode);
/* Not in dcache: ask the MDS for the FID by name. */
2883 rc = ll_get_fid_by_name(parent, name, namelen,
2889 if (!fid_is_sane(&op_data->op_fid3)) {
2890 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
2891 ll_get_fsname(parent->i_sb, NULL, 0), name,
2892 PFID(&op_data->op_fid3));
2893 GOTO(out_free, rc = -EINVAL);
/* Already on the target MDT: nothing to do. */
2896 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
2901 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
2902 PFID(&op_data->op_fid3), mdtidx);
2903 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
2906 op_data->op_mds = mdtidx;
2907 op_data->op_cli_flags = CLI_MIGRATE;
2908 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
2909 namelen, name, namelen, &request);
2911 ll_update_times(request, parent);
2913 ptlrpc_req_finished(request);
/* Drop the stale cached inode; it now lives on another MDT. */
2918 if (child_inode != NULL) {
2919 clear_nlink(child_inode);
2920 mutex_unlock(&child_inode->i_mutex);
2924 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock(): lock entry point for -o noflock mounts.
 * NOTE(review): body elided in this excerpt — presumably returns an
 * error (see "-o noflock" comment before ll_file_operations_noflock);
 * confirm against the full source.
 */
2929 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2937 * test if some locks matching bits and l_req_mode are acquired
2938 * - bits can be in different locks
2939 * - if found clear the common lock bits in *bits
2940 * - the bits not found, are kept in *bits
2942 * \param bits [IN,OUT] searched lock bits; found bits are cleared
2943 * \param l_req_mode [IN] searched lock mode
2944 * \retval boolean, true iff all bits are found
2946 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2948 struct lustre_handle lockh;
2949 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes at once. */
2950 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2951 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2960 fid = &ll_i2info(inode)->lli_fid;
2961 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2962 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the lock. */
2964 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit individually; stop early once all bits found. */
2965 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2966 policy.l_inodebits.bits = *bits & (1 << i);
2967 if (policy.l_inodebits.bits == 0)
2970 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2971 &policy, mode, &lockh)) {
2972 struct ldlm_lock *lock;
2974 lock = ldlm_handle2lock(&lockh);
/* Clear every bit the matched lock covers, not just the probed one. */
2977 ~(lock->l_policy_data.l_inodebits.bits);
2978 LDLM_LOCK_PUT(lock);
2980 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): try to match (and take a reference on) a granted
 * MD lock on @inode covering inodebits @bits in one of @mode's modes.
 * On success the handle is stored in *lockh and the matched mode is
 * returned; 0 means no match.
 */
2987 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2988 struct lustre_handle *lockh, __u64 flags,
2991 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2996 fid = &ll_i2info(inode)->lli_fid;
2997 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2999 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3000 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini(): post-process a revalidation RPC result.
 * -ENOENT on a non-regular/non-directory inode is treated as a benign
 * race with unlink; other errors are logged (quietly for -EACCES and
 * -EIDRM, which are expected under permission/identity changes).
 */
3005 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3007 /* Already unlinked. Just update nlink and return success */
3008 if (rc == -ENOENT) {
3010 /* This path cannot be hit for regular files unless in
3011 * case of obscure races, so no need to validate
3013 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3015 } else if (rc != 0) {
3016 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3017 "%s: revalidate FID "DFID" error: rc = %d\n",
3018 ll_get_fsname(inode->i_sb, NULL, 0),
3019 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate(): refresh @dentry's inode attributes from the
 * MDS for the lock bits in @ibits. Two paths:
 *  - ATTRFID-capable server: intent getattr/lookup by FID;
 *  - otherwise: plain md_getattr, but only if we do not already hold a
 *    matching MD lock locally.
 */
3025 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3027 struct inode *inode = dentry->d_inode;
3028 struct ptlrpc_request *req = NULL;
3029 struct obd_export *exp;
3033 LASSERT(inode != NULL);
3035 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3036 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3038 exp = ll_i2mdexp(inode);
3040 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3041 * But under CMD case, it caused some lock issues, should be fixed
3042 * with new CMD ibits lock. See bug 12718 */
3043 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3044 struct lookup_intent oit = { .it_op = IT_GETATTR };
3045 struct md_op_data *op_data;
/* Pure LOOKUP bit only needs a lookup intent, not full getattr. */
3047 if (ibits == MDS_INODELOCK_LOOKUP)
3048 oit.it_op = IT_LOOKUP;
3050 /* Call getattr by fid, so do not provide name at all. */
3051 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3052 dentry->d_inode, NULL, 0, 0,
3053 LUSTRE_OPC_ANY, NULL);
3054 if (IS_ERR(op_data))
3055 RETURN(PTR_ERR(op_data));
3057 rc = md_intent_lock(exp, op_data, &oit, &req,
3058 &ll_md_blocking_ast, 0);
3059 ll_finish_md_op_data(op_data);
3061 rc = ll_inode_revalidate_fini(inode, rc);
3065 rc = ll_revalidate_it_finish(req, &oit, dentry);
3067 ll_intent_release(&oit);
3071 /* Unlinked? Unhash dentry, so it is not picked up later by
3072 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3073 here to preserve get_cwd functionality on 2.6.
3075 if (!dentry->d_inode->i_nlink)
3076 d_lustre_invalidate(dentry, 0);
3078 ll_lookup_finish_locks(&oit, dentry);
3079 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3080 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3081 u64 valid = OBD_MD_FLGETATTR;
3082 struct md_op_data *op_data;
/* Regular files also need the EA (striping) size in the reply. */
3085 if (S_ISREG(inode->i_mode)) {
3086 rc = ll_get_default_mdsize(sbi, &ealen);
3089 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3092 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3093 0, ealen, LUSTRE_OPC_ANY,
3095 if (IS_ERR(op_data))
3096 RETURN(PTR_ERR(op_data));
3098 op_data->op_valid = valid;
3099 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3100 * capa for this inode. Because we only keep capas of dirs
3102 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3103 ll_finish_md_op_data(op_data);
3105 rc = ll_inode_revalidate_fini(inode, rc);
3109 rc = ll_prep_inode(&inode, req, NULL, NULL);
3112 ptlrpc_req_finished(req);
/*
 * ll_merge_md_attr(): for a striped directory, merge the attributes of
 * all stripes (via md_merge_attr) and install the result — nlink,
 * blocks, size and the a/m/ctime cache — into the master inode.
 */
3116 static int ll_merge_md_attr(struct inode *inode)
3118 struct cl_attr attr = { 0 };
3121 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3122 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3123 &attr, ll_md_blocking_ast);
3127 set_nlink(inode, attr.cat_nlink);
3128 inode->i_blocks = attr.cat_blocks;
3129 i_size_write(inode, attr.cat_size);
/* Times are cached in lli_* and copied to the inode by the caller. */
3131 ll_i2info(inode)->lli_atime = attr.cat_atime;
3132 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
3133 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * ll_inode_revalidate(): revalidate MD attributes, then refresh size:
 * striped directories merge stripe attributes, other non-regular files
 * just copy cached times, and regular files glimpse the OSTs — unless a
 * HSM restore is running, in which case the MDT size is authoritative.
 */
3139 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3141 struct inode *inode = dentry->d_inode;
3145 rc = __ll_inode_revalidate(dentry, ibits);
3149 /* if object isn't regular file, don't validate size */
3150 if (!S_ISREG(inode->i_mode)) {
3151 if (S_ISDIR(inode->i_mode) &&
3152 ll_i2info(inode)->lli_lsm_md != NULL) {
3153 rc = ll_merge_md_attr(inode);
/* Propagate the cached MD times into the VFS inode. */
3158 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
3159 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
3160 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
3162 /* In case of restore, the MDT has the right size and has
3163 * already send it back without granting the layout lock,
3164 * inode is up-to-date so glimpse is useless.
3165 * Also to glimpse we need the layout, in case of a running
3166 * restore the MDT holds the layout lock so the glimpse will
3167 * block up to the end of restore (getattr will block)
3169 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3170 rc = ll_glimpse_size(inode);
/*
 * ll_getattr(): VFS ->getattr. Revalidates UPDATE|LOOKUP bits, then
 * fills *stat from the (now fresh) inode fields.
 */
3175 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3177 struct inode *inode = de->d_inode;
3178 struct ll_sb_info *sbi = ll_i2sbi(inode);
3179 struct ll_inode_info *lli = ll_i2info(inode);
3182 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3183 MDS_INODELOCK_LOOKUP);
3184 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3189 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland gets a FID-derived 32-bit inode number. */
3190 if (ll_need_32bit_api(sbi))
3191 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3193 stat->ino = inode->i_ino;
3194 stat->mode = inode->i_mode;
3195 stat->uid = inode->i_uid;
3196 stat->gid = inode->i_gid;
3197 stat->rdev = inode->i_rdev;
3198 stat->atime = inode->i_atime;
3199 stat->mtime = inode->i_mtime;
3200 stat->ctime = inode->i_ctime;
3201 stat->blksize = 1 << inode->i_blkbits;
3203 stat->nlink = inode->i_nlink;
3204 stat->size = i_size_read(inode);
3205 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap(): VFS fiemap handler. Marshals fieinfo into a
 * ll_user_fiemap buffer sized for fi_extents_max extents, runs
 * ll_do_fiemap(), then copies the mapped extents back out.
 */
3210 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3211 __u64 start, __u64 len)
3215 struct ll_user_fiemap *fiemap;
3216 unsigned int extent_count = fieinfo->fi_extents_max;
3218 num_bytes = sizeof(*fiemap) + (extent_count *
3219 sizeof(struct ll_fiemap_extent));
3220 OBD_ALLOC_LARGE(fiemap, num_bytes);
3225 fiemap->fm_flags = fieinfo->fi_flags;
3226 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3227 fiemap->fm_start = start;
3228 fiemap->fm_length = len;
/* Seed only the first extent from userspace (FIEMAP continuation). */
3229 if (extent_count > 0)
3230 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3231 sizeof(struct ll_fiemap_extent));
3233 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3235 fieinfo->fi_flags = fiemap->fm_flags;
3236 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3237 if (extent_count > 0)
3238 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3239 fiemap->fm_mapped_extents *
3240 sizeof(struct ll_fiemap_extent));
3242 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): return a referenced copy of the cached POSIX ACL.
 * The lli_lock guards lli_posix_acl; the VFS drops the reference.
 */
3246 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3248 struct ll_inode_info *lli = ll_i2info(inode);
3249 struct posix_acl *acl = NULL;
3252 spin_lock(&lli->lli_lock);
3253 /* VFS' acl_permission_check->check_acl will release the refcount */
3254 acl = posix_acl_dup(lli->lli_posix_acl);
3255 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl(): ACL callback for generic_permission() on kernels
 * without the 2-arg generic_permission(). Compiled only when needed;
 * the RCU-walk path bails out early (cannot block to fetch the ACL).
 */
3260 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3262 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3263 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3265 ll_check_acl(struct inode *inode, int mask)
3268 # ifdef CONFIG_FS_POSIX_ACL
3269 struct posix_acl *acl;
3273 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3274 if (flags & IPERM_FLAG_RCU)
3277 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3282 rc = posix_acl_permission(inode, acl, mask);
3283 posix_acl_release(acl);
3286 # else /* !CONFIG_FS_POSIX_ACL */
3288 # endif /* CONFIG_FS_POSIX_ACL */
3290 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission(): VFS ->permission. Revalidates the root inode
 * when needed, applies root squash by temporarily overriding the
 * process credentials, and delegates to remote-perm checking or
 * generic_permission(). Signature varies with kernel version.
 */
3292 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3293 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3295 # ifdef HAVE_INODE_PERMISION_2ARGS
3296 int ll_inode_permission(struct inode *inode, int mask)
3298 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3303 struct ll_sb_info *sbi;
3304 struct root_squash_info *squash;
3305 struct cred *cred = NULL;
3306 const struct cred *old_cred = NULL;
3308 bool squash_id = false;
/* RCU-walk lookups must not block; defer to ref-walk. */
3311 #ifdef MAY_NOT_BLOCK
3312 if (mask & MAY_NOT_BLOCK)
3314 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3315 if (flags & IPERM_FLAG_RCU)
3319 /* as root inode are NOT getting validated in lookup operation,
3320 * need to do it before permission check. */
3322 if (inode == inode->i_sb->s_root->d_inode) {
3323 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3324 MDS_INODELOCK_LOOKUP);
3329 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3330 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3332 /* squash fsuid/fsgid if needed */
3333 sbi = ll_i2sbi(inode);
3334 squash = &sbi->ll_squash;
3335 if (unlikely(squash->rsi_uid != 0 &&
3336 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3337 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3341 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3342 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3343 squash->rsi_uid, squash->rsi_gid);
3345 /* update current process's credentials
3346 * and FS capability */
3347 cred = prepare_creds();
3351 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3352 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem capability from the squashed credentials. */
3353 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3354 if ((1 << cap) & CFS_CAP_FS_MASK)
3355 cap_lower(cred->cap_effective, cap);
3357 old_cred = override_creds(cred);
3360 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3362 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3363 rc = lustre_check_remote_perm(inode, mask);
3365 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3367 /* restore current process's credentials and FS capability */
3369 revert_creds(old_cred);
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no ->flock/->lock, so advisory locks fall
 * back to the kernel's local implementation. */
3377 struct file_operations ll_file_operations = {
3378 .read = ll_file_read,
3379 .aio_read = ll_file_aio_read,
3380 .write = ll_file_write,
3381 .aio_write = ll_file_aio_write,
3382 .unlocked_ioctl = ll_file_ioctl,
3383 .open = ll_file_open,
3384 .release = ll_file_release,
3385 .mmap = ll_file_mmap,
3386 .llseek = ll_file_seek,
3387 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent advisory locks via ll_file_flock(). */
3392 struct file_operations ll_file_operations_flock = {
3393 .read = ll_file_read,
3394 .aio_read = ll_file_aio_read,
3395 .write = ll_file_write,
3396 .aio_write = ll_file_aio_write,
3397 .unlocked_ioctl = ll_file_ioctl,
3398 .open = ll_file_open,
3399 .release = ll_file_release,
3400 .mmap = ll_file_mmap,
3401 .llseek = ll_file_seek,
3402 .splice_read = ll_file_splice_read,
3405 .flock = ll_file_flock,
3406 .lock = ll_file_flock
3409 /* These are for -o noflock - to return ENOSYS on flock calls */
3410 struct file_operations ll_file_operations_noflock = {
3411 .read = ll_file_read,
3412 .aio_read = ll_file_aio_read,
3413 .write = ll_file_write,
3414 .aio_write = ll_file_aio_write,
3415 .unlocked_ioctl = ll_file_ioctl,
3416 .open = ll_file_open,
3417 .release = ll_file_release,
3418 .mmap = ll_file_mmap,
3419 .llseek = ll_file_seek,
3420 .splice_read = ll_file_splice_read,
3423 .flock = ll_file_noflock,
3424 .lock = ll_file_noflock
/* inode_operations for regular Lustre files. */
3427 struct inode_operations ll_file_inode_operations = {
3428 .setattr = ll_setattr,
3429 .getattr = ll_getattr,
3430 .permission = ll_inode_permission,
3431 .setxattr = ll_setxattr,
3432 .getxattr = ll_getxattr,
3433 .listxattr = ll_listxattr,
3434 .removexattr = ll_removexattr,
3435 .fiemap = ll_fiemap,
3436 #ifdef HAVE_IOP_GET_ACL
3437 .get_acl = ll_get_acl,
3441 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by an rwsem. */
3442 static struct llioc_ctl_data {
3443 struct rw_semaphore ioc_sem;
3444 struct list_head ioc_head;
3446 __RWSEM_INITIALIZER(llioc.ioc_sem),
3447 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it serves
 * (iocd_cmd is a flexible-style trailing array of iocd_count entries). */
3452 struct list_head iocd_list;
3453 unsigned int iocd_size;
3454 llioc_callback_t iocd_cb;
3455 unsigned int iocd_count;
3456 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register(): register callback @cb for @count ioctl
 * command numbers in @cmd. Returns an opaque cookie for later
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (elided in this excerpt).
 */
3459 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3462 struct llioc_data *in_data = NULL;
3465 if (cb == NULL || cmd == NULL ||
3466 count > LLIOC_MAX_CMD || count < 0)
3469 size = sizeof(*in_data) + count * sizeof(unsigned int);
3470 OBD_ALLOC(in_data, size);
3471 if (in_data == NULL)
3474 memset(in_data, 0, sizeof(*in_data));
3475 in_data->iocd_size = size;
3476 in_data->iocd_cb = cb;
3477 in_data->iocd_count = count;
3478 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3480 down_write(&llioc.ioc_sem);
3481 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3482 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): remove and free the registration
 * identified by @magic (the cookie from ll_iocontrol_register()).
 * Warns if no matching entry is found.
 */
3487 void ll_iocontrol_unregister(void *magic)
3489 struct llioc_data *tmp;
3494 down_write(&llioc.ioc_sem);
3495 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3497 unsigned int size = tmp->iocd_size;
3499 list_del(&tmp->iocd_list);
3500 up_write(&llioc.ioc_sem);
3502 OBD_FREE(tmp, size);
3506 up_write(&llioc.ioc_sem);
3508 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3511 EXPORT_SYMBOL(ll_iocontrol_register);
3512 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch ioctl @cmd to the first registered
 * handler that claims it. The handler's rc is stored via *rcp; a
 * LLIOC_STOP return ends the scan.
 */
3514 static enum llioc_iter
3515 ll_iocontrol_call(struct inode *inode, struct file *file,
3516 unsigned int cmd, unsigned long arg, int *rcp)
3518 enum llioc_iter ret = LLIOC_CONT;
3519 struct llioc_data *data;
3520 int rc = -EINVAL, i;
3522 down_read(&llioc.ioc_sem);
3523 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3524 for (i = 0; i < data->iocd_count; i++) {
3525 if (cmd != data->iocd_cmd[i])
3528 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3532 if (ret == LLIOC_STOP)
3535 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf(): push a layout configuration change down to the
 * cl_object stack (cl_conf_set). For OBJECT_CONF_SET, once the layout
 * is applied the DLM layout lock is allowed to match and the cached
 * layout generation is updated from the object.
 */
3542 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3544 struct ll_inode_info *lli = ll_i2info(inode);
3545 struct cl_object *obj = lli->lli_clob;
3546 struct cl_env_nest nest;
3554 env = cl_env_nested_get(&nest);
3556 RETURN(PTR_ERR(env));
3558 rc = cl_conf_set(env, lli->lli_clob, conf);
3562 if (conf->coc_opc == OBJECT_CONF_SET) {
3563 struct ldlm_lock *lock = conf->coc_lock;
3564 struct cl_layout cl = {
3568 LASSERT(lock != NULL);
3569 LASSERT(ldlm_has_layout(lock));
3571 /* it can only be allowed to match after layout is
3572 * applied to inode otherwise false layout would be
3573 * seen. Applying layout should happen before dropping
3574 * the intent lock. */
3575 ldlm_lock_allow_match(lock);
3577 rc = cl_object_layout_get(env, obj, &cl);
3582 DFID": layout version change: %u -> %u\n",
3583 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3585 ll_layout_version_set(lli, cl.cl_layout_gen);
3589 cl_env_nested_put(&nest, env);
3594 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch(): populate @lock's LVB with the file layout. No-op
 * when the LVB is already present and ready; otherwise the LOV EA is
 * fetched with a getxattr RPC and copied into a fresh LVB buffer under
 * the resource lock.
 */
3595 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3598 struct ll_sb_info *sbi = ll_i2sbi(inode);
3599 struct obd_capa *oc;
3600 struct ptlrpc_request *req;
3601 struct mdt_body *body;
3608 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3609 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3610 lock->l_lvb_data, lock->l_lvb_len);
3612 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3615 /* if layout lock was granted right away, the layout is returned
3616 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3617 * blocked and then granted via completion ast, we have to fetch
3618 * layout here. Please note that we can't use the LVB buffer in
3619 * completion AST because it doesn't have a large enough buffer */
3620 oc = ll_mdscapa_get(inode);
3621 rc = ll_get_default_mdsize(sbi, &lmmsize);
3623 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3624 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3630 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3632 GOTO(out, rc = -EPROTO);
3634 lmmsize = body->mbo_eadatasize;
3635 if (lmmsize == 0) /* empty layout */
3638 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3640 GOTO(out, rc = -EFAULT);
3642 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3643 if (lvbdata == NULL)
3644 GOTO(out, rc = -ENOMEM);
/* Swap in the new LVB under the resource lock, freeing any old one. */
3646 memcpy(lvbdata, lmm, lmmsize);
3647 lock_res_and_lock(lock);
3648 if (lock->l_lvb_data != NULL)
3649 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3651 lock->l_lvb_data = lvbdata;
3652 lock->l_lvb_len = lmmsize;
3653 unlock_res_and_lock(lock);
3658 ptlrpc_req_finished(req);
3663 * Apply the layout to the inode. Layout lock is held and will be released
3666 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3667 struct inode *inode)
3669 struct ll_inode_info *lli = ll_i2info(inode);
3670 struct ll_sb_info *sbi = ll_i2sbi(inode);
3671 struct ldlm_lock *lock;
3672 struct lustre_md md = { NULL };
3673 struct cl_object_conf conf;
3676 bool wait_layout = false;
3679 LASSERT(lustre_handle_is_used(lockh));
3681 lock = ldlm_handle2lock(lockh);
3682 LASSERT(lock != NULL);
3683 LASSERT(ldlm_has_layout(lock));
3685 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
3686 PFID(&lli->lli_fid), inode);
3688 /* in case this is a caching lock and reinstate with new inode */
3689 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3691 lock_res_and_lock(lock);
3692 lvb_ready = ldlm_is_lvb_ready(lock);
3693 unlock_res_and_lock(lock);
3694 /* checking lvb_ready is racy but this is okay. The worst case is
3695 * that multi processes may configure the file on the same time. */
3700 rc = ll_layout_fetch(inode, lock);
3704 /* for layout lock, lmm is returned in lock's lvb.
3705 * lvb_data is immutable if the lock is held so it's safe to access it
3706 * without res lock. See the description in ldlm_lock_decref_internal()
3707 * for the condition to free lvb_data of layout lock */
3708 if (lock->l_lvb_data != NULL) {
3709 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3710 lock->l_lvb_data, lock->l_lvb_len);
3712 CERROR("%s: file "DFID" unpackmd error: %d\n",
3713 ll_get_fsname(inode->i_sb, NULL, 0),
3714 PFID(&lli->lli_fid), rc);
3718 LASSERTF(md.lsm != NULL, "lvb_data = %p, lvb_len = %u\n",
3719 lock->l_lvb_data, lock->l_lvb_len);
3724 /* set layout to file. Unlikely this will fail as old layout was
3725 * surely eliminated */
3726 memset(&conf, 0, sizeof conf);
3727 conf.coc_opc = OBJECT_CONF_SET;
3728 conf.coc_inode = inode;
3729 conf.coc_lock = lock;
3730 conf.u.coc_md = &md;
3731 rc = ll_layout_conf(inode, &conf);
3734 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3736 /* refresh layout failed, need to wait */
3737 wait_layout = rc == -EBUSY;
3741 LDLM_LOCK_PUT(lock);
3742 ldlm_lock_decref(lockh, mode);
3744 /* wait for IO to complete if it's still being used. */
3746 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3747 ll_get_fsname(inode->i_sb, NULL, 0),
3748 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout
 * drains, then the caller retries the refresh. */
3750 memset(&conf, 0, sizeof conf);
3751 conf.coc_opc = OBJECT_CONF_WAIT;
3752 conf.coc_inode = inode;
3753 rc = ll_layout_conf(inode, &conf);
3757 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
3758 ll_get_fsname(inode->i_sb, NULL, 0),
3759 PFID(&lli->lli_fid), rc);
/*
 * ll_layout_refresh_locked(): ensure a layout lock is held and applied
 * to @inode. First tries to match a cached lock; on miss, enqueues an
 * IT_LAYOUT intent lock on the MDS and applies the returned layout.
 * Caller holds lli_layout_mutex.
 */
3764 static int ll_layout_refresh_locked(struct inode *inode)
3766 struct ll_inode_info *lli = ll_i2info(inode);
3767 struct ll_sb_info *sbi = ll_i2sbi(inode);
3768 struct md_op_data *op_data;
3769 struct lookup_intent it;
3770 struct lustre_handle lockh;
3772 struct ldlm_enqueue_info einfo = {
3773 .ei_type = LDLM_IBITS,
3775 .ei_cb_bl = &ll_md_blocking_ast,
3776 .ei_cb_cp = &ldlm_completion_ast,
3782 /* mostly layout lock is caching on the local side, so try to match
3783 * it before grabbing layout lock mutex. */
3784 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3785 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3786 if (mode != 0) { /* hit cached lock */
3787 rc = ll_layout_lock_set(&lockh, mode, inode);
3794 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3795 0, 0, LUSTRE_OPC_ANY, NULL);
3796 if (IS_ERR(op_data))
3797 RETURN(PTR_ERR(op_data));
3799 /* have to enqueue one */
3800 memset(&it, 0, sizeof(it));
3801 it.it_op = IT_LAYOUT;
3802 lockh.cookie = 0ULL;
3804 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
3805 ll_get_fsname(inode->i_sb, NULL, 0),
3806 PFID(&lli->lli_fid), inode);
3808 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* Release the intent request; only the lock handle is kept. */
3809 if (it.d.lustre.it_data != NULL)
3810 ptlrpc_req_finished(it.d.lustre.it_data);
3811 it.d.lustre.it_data = NULL;
3813 ll_finish_md_op_data(op_data);
/* Take over the lock reference from the intent before dropping it. */
3815 mode = it.d.lustre.it_lock_mode;
3816 it.d.lustre.it_lock_mode = 0;
3817 ll_intent_drop_lock(&it);
3820 /* set lock data in case this is a new lock */
3821 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3822 rc = ll_layout_lock_set(&lockh, mode, inode);
3831 * This function checks if there exists a LAYOUT lock on the client side,
3832 * or enqueues it if it doesn't have one in cache.
3834 * This function will not hold layout lock so it may be revoked any time after
3835 * this function returns. Any operations depend on layout should be redone
3838 * This function should be called before lov_io_init() to get an uptodate
3839 * layout version, the caller should save the version number and after IO
3840 * is finished, this function should be called again to verify that layout
3841 * is not changed during IO time.
3843 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3845 struct ll_inode_info *lli = ll_i2info(inode);
3846 struct ll_sb_info *sbi = ll_i2sbi(inode);
3850 *gen = ll_layout_version_get(lli);
/* Fast path: layout locks disabled, or we already have a generation. */
3851 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
3855 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3856 LASSERT(S_ISREG(inode->i_mode));
3858 /* take layout lock mutex to enqueue layout lock exclusively. */
3859 mutex_lock(&lli->lli_layout_mutex);
3861 rc = ll_layout_refresh_locked(inode);
3865 *gen = ll_layout_version_get(lli);
3867 mutex_unlock(&lli->lli_layout_mutex);
3873 * This function send a restore request to the MDT
3875 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
3877 struct hsm_user_request *hur;
3881 len = sizeof(struct hsm_user_request) +
3882 sizeof(struct hsm_user_item);
3883 OBD_ALLOC(hur, len);
3887 hur->hur_request.hr_action = HUA_RESTORE;
3888 hur->hur_request.hr_archive_id = 0;
3889 hur->hur_request.hr_flags = 0;
3890 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3891 sizeof(hur->hur_user_item[0].hui_fid));
3892 hur->hur_user_item[0].hui_extent.offset = offset;
3893 hur->hur_user_item[0].hui_extent.length = length;
3894 hur->hur_request.hr_itemcount = 1;
3895 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,