4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from the dedicated slab cache and clear
 * its write-failure flag.
 * NOTE(review): this listing is missing lines (allocation NULL check and the
 * return statement) -- confirm against the complete source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
61 fd->fd_write_failed = false;
/* Release a per-open ll_file_data back to its slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags,
 * ioepoch) plus the open handle @fh and an MDS capability into @op_data,
 * ready for an MDS request.  Also propagates the DATA_MODIFIED hint so
 * the MDS knows the file data changed during this open.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.  For write opens, size/blocks are only sent when SOM
 * (size-on-MDS) is disabled or the object is not a regular file.
 * NOTE(review): listing is missing lines around the FMODE_WRITE branch
 * (likely a GOTO/label) -- confirm control flow against the full source.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE rpc to the MDS for open handle @och on @inode.
 * On a SOM-completing close, fetch attributes from the OSTs and push a
 * Size-on-MDS update; on success with MDS_DATA_MODIFIED, clear the local
 * LLIF_DATA_MODIFIED flag.  Destroys orphan objects referenced by the
 * close reply and clears the open replay data; the handle cookie is
 * poisoned with DEAD_HANDLE_MAGIC.
 * NOTE(review): several lines (error checks, GOTO labels, RETURN) are
 * missing from this listing -- verify against the complete source.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM epoch still open on a written regular file: defer DONE_WRITING. */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Drop the last local reference on the per-mode MDS open handle
 * (write/exec/read, selected by @flags) and, if nobody else is using it,
 * send the actual CLOSE to the MDS via ll_close_inode_openhandle().
 * The och pointer is sampled/cleared under lli_och_mutex; the close RPC
 * itself runs outside the mutex.
 */
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: release the group lock (if held), clean
 * up a stale lease handle left by a crashed application, close any
 * fd-private open handle, then drop this fd's reference on the shared
 * per-mode open counters.  If no matching OPEN lock remains cached on the
 * inode, fall through to ll_md_real_close() to close on the MDS.
 * Finally detach and free the ll_file_data and close the OSS capability.
 */
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 if (fd->fd_lease_och != NULL) {
257 /* Usually the lease is not released when the
258 * application crashed, we need to release here. */
259 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
260 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
261 PFID(&lli->lli_fid), rc, lease_broken);
263 fd->fd_lease_och = NULL;
266 if (fd->fd_och != NULL) {
267 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och);
272 /* Let's see if we have good enough OPEN lock on the file and if
273 we can skip talking to MDS */
274 if (file->f_dentry->d_inode) { /* Can this ever be false? */
276 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
277 struct lustre_handle lockh;
278 struct inode *inode = file->f_dentry->d_inode;
279 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
281 mutex_lock(&lli->lli_och_mutex);
282 if (fd->fd_omode & FMODE_WRITE) {
284 LASSERT(lli->lli_open_fd_write_count);
285 lli->lli_open_fd_write_count--;
286 } else if (fd->fd_omode & FMODE_EXEC) {
288 LASSERT(lli->lli_open_fd_exec_count);
289 lli->lli_open_fd_exec_count--;
292 LASSERT(lli->lli_open_fd_read_count);
293 lli->lli_open_fd_read_count--;
295 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock matched: must do the real MDS close now. */
297 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
298 LDLM_IBITS, &policy, lockmode,
300 rc = ll_md_real_close(file->f_dentry->d_inode,
304 CERROR("Releasing a file %p with negative dentry %p. Name %s",
305 file, file->f_dentry, file->f_dentry->d_name.name);
309 LUSTRE_FPRIVATE(file) = NULL;
310 ll_file_data_put(fd);
311 ll_capa_close(inode);
316 /* While this returns an error code, fput() the caller does not, so we need
317 * to make every effort to clean up all of our state here. Also, applications
318 * rarely check close errors and even if an error is returned they will not
319 * re-try the close call.
321 int ll_file_release(struct inode *inode, struct file *file)
323 struct ll_file_data *fd;
324 struct ll_sb_info *sbi = ll_i2sbi(inode);
325 struct ll_inode_info *lli = ll_i2info(inode);
329 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
330 inode->i_generation, inode);
/* Remote-client ACL bookkeeping is torn down only for the root inode. */
332 #ifdef CONFIG_FS_POSIX_ACL
333 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
334 inode == inode->i_sb->s_root->d_inode) {
335 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
338 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
339 fd->fd_flags &= ~LL_FILE_RMTACL;
340 rct_del(&sbi->ll_rct, cfs_curproc_pid());
341 et_search_free(&sbi->ll_et, cfs_curproc_pid());
346 if (inode->i_sb->s_root != file->f_dentry)
347 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
348 fd = LUSTRE_FPRIVATE(file);
351 /* The last ref on @file, maybe not the the owner pid of statahead.
352 * Different processes can open the same dir, "ll_opendir_key" means:
353 * it is me that should stop the statahead thread. */
354 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
355 lli->lli_opendir_pid != 0)
356 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root directory gets no MDS close: just free the private data. */
358 if (inode->i_sb->s_root == file->f_dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
364 if (!S_ISDIR(inode->i_mode)) {
365 lov_read_and_clear_async_rc(lli->lli_clob);
366 lli->lli_async_rc = 0;
369 rc = ll_md_close(sbi->ll_md_exp, inode, file);
371 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
372 libcfs_debug_dumplog();
/*
 * Re-open @file on the MDS via an intent lock (md_intent_lock) keyed by
 * FID.  When not merely setting stripe info (lmm == NULL && lmmsize == 0)
 * an OPEN lock is requested, and FMODE_WRITE promotes the opcode to
 * LUSTRE_OPC_CREATE.  On success the reply is used to refresh the inode
 * (ll_prep_inode) and record the granted lock (ll_set_lock_data).
 * The -ESTALE path releases the open handle quietly to avoid log spam.
 */
377 static int ll_intent_file_open(struct file *file, void *lmm,
378 int lmmsize, struct lookup_intent *itp)
380 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
381 struct dentry *parent = file->f_dentry->d_parent;
382 struct md_op_data *op_data;
383 struct ptlrpc_request *req;
384 __u32 opc = LUSTRE_OPC_ANY;
391 /* Usually we come here only for NFSD, and we want open lock.
392 But we can also get here with pre 2.6.15 patchless kernels, and in
393 that case that lock is also ok */
394 /* We can also get here if there was cached open handle in revalidate_it
395 * but it disappeared while we were getting from there to ll_file_open.
396 * But this means this file was closed and immediatelly opened which
397 * makes a good candidate for using OPEN lock */
398 /* If lmmsize & lmm are not 0, we are just setting stripe info
399 * parameters. No need for the open lock */
400 if (lmm == NULL && lmmsize == 0) {
401 itp->it_flags |= MDS_OPEN_LOCK;
402 if (itp->it_flags & FMODE_WRITE)
403 opc = LUSTRE_OPC_CREATE;
406 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
407 file->f_dentry->d_inode, NULL, 0,
411 RETURN(PTR_ERR(op_data));
413 itp->it_flags |= MDS_OPEN_BY_FID;
414 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
415 0 /*unused */, &req, ll_md_blocking_ast, 0);
416 ll_finish_md_op_data(op_data);
418 /* reason for keep own exit path - don`t flood log
419 * with messages with -ESTALE errors.
421 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
422 it_open_error(DISP_OPEN_OPEN, itp))
424 ll_release_openhandle(file->f_dentry, itp);
428 if (it_disposition(itp, DISP_LOOKUP_NEG))
429 GOTO(out, rc = -ENOENT);
431 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
432 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
433 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
437 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
438 if (!rc && itp->d.lustre.it_lock_mode)
439 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Common exit: drop the request ref and any intent lock still held. */
443 ptlrpc_req_finished(itp->d.lustre.it_data);
444 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
445 ll_intent_drop_lock(itp);
451 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
452 * not believe attributes if a few ioepoch holders exist. Attributes for
453 * previous ioepoch if new one is opened are also skipped by MDS.
455 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
457 if (ioepoch && lli->lli_ioepoch != ioepoch) {
458 lli->lli_ioepoch = ioepoch;
459 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
460 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate @och (fh, fid, lease lock handle, magic, open flags) from the
 * MDT_BODY in the intent's reply, then register the open for replay with
 * md_set_open_replay_data().  Returns its result.
 */
464 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
465 struct obd_client_handle *och)
467 struct ptlrpc_request *req = it->d.lustre.it_data;
468 struct mdt_body *body;
470 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
471 och->och_fh = body->handle;
472 och->och_fid = body->fid1;
473 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
474 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
475 och->och_flags = it->it_flags;
477 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a local open: if @och is supplied, fill it from the intent reply
 * and record the new IO epoch; then attach @fd as the file's private data
 * and initialize the readahead state and the effective open mode.
 * NOTE(review): error-handling lines after ll_och_fill() are missing from
 * this listing -- confirm against the full source.
 */
480 int ll_local_open(struct file *file, struct lookup_intent *it,
481 struct ll_file_data *fd, struct obd_client_handle *och)
483 struct inode *inode = file->f_dentry->d_inode;
484 struct ll_inode_info *lli = ll_i2info(inode);
487 LASSERT(!LUSTRE_FPRIVATE(file));
492 struct ptlrpc_request *req = it->d.lustre.it_data;
493 struct mdt_body *body;
496 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
500 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
501 ll_ioepoch_open(lli, body->ioepoch);
504 LUSTRE_FPRIVATE(file) = fd;
505 ll_readahead_init(inode, &fd->fd_ras);
506 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
511 /* Open a file, and (for the very first open) create objects on the OSTs at
512 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
513 * creation or open until ll_lov_setstripe() ioctl is called.
515 * If we already have the stripe MD locally then we don't request it in
516 * md_open(), by passing a lmm_size = 0.
518 * It is up to the application to ensure no other processes open this file
519 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
520 * used. We might be able to avoid races of that sort by getting lli_open_sem
521 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
522 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
524 int ll_file_open(struct inode *inode, struct file *file)
526 struct ll_inode_info *lli = ll_i2info(inode);
527 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
528 .it_flags = file->f_flags };
529 struct obd_client_handle **och_p = NULL;
530 __u64 *och_usecount = NULL;
531 struct ll_file_data *fd;
532 int rc = 0, opendir_set = 0;
535 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
536 inode->i_generation, inode, file->f_flags);
538 it = file->private_data; /* XXX: compat macro */
539 file->private_data = NULL; /* prevent ll_local_open assertion */
541 fd = ll_file_data_get();
543 GOTO(out_openerr, rc = -ENOMEM);
/* First opener of a directory claims the statahead "opendir key". */
546 if (S_ISDIR(inode->i_mode)) {
547 spin_lock(&lli->lli_sa_lock);
548 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
549 lli->lli_opendir_pid == 0) {
550 lli->lli_opendir_key = fd;
551 lli->lli_opendir_pid = cfs_curproc_pid();
554 spin_unlock(&lli->lli_sa_lock);
557 if (inode->i_sb->s_root == file->f_dentry) {
558 LUSTRE_FPRIVATE(file) = fd;
/* No cached open intent: build one (oit) from the f_flags. */
562 if (!it || !it->d.lustre.it_disposition) {
563 /* Convert f_flags into access mode. We cannot use file->f_mode,
564 * because everything but O_ACCMODE mask was stripped from
566 if ((oit.it_flags + 1) & O_ACCMODE)
568 if (file->f_flags & O_TRUNC)
569 oit.it_flags |= FMODE_WRITE;
571 /* kernel only call f_op->open in dentry_open. filp_open calls
572 * dentry_open after call to open_namei that checks permissions.
573 * Only nfsd_open call dentry_open directly without checking
574 * permissions and because of that this code below is safe. */
575 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
578 /* We do not want O_EXCL here, presumably we opened the file
579 * already? XXX - NFS implications? */
580 oit.it_flags &= ~O_EXCL;
582 /* bug20584, if "it_flags" contains O_CREAT, the file will be
583 * created if necessary, then "IT_CREAT" should be set to keep
584 * consistent with it */
585 if (oit.it_flags & O_CREAT)
586 oit.it_op |= IT_CREAT;
592 /* Let's see if we have file open on MDS already. */
593 if (it->it_flags & FMODE_WRITE) {
594 och_p = &lli->lli_mds_write_och;
595 och_usecount = &lli->lli_open_fd_write_count;
596 } else if (it->it_flags & FMODE_EXEC) {
597 och_p = &lli->lli_mds_exec_och;
598 och_usecount = &lli->lli_open_fd_exec_count;
600 och_p = &lli->lli_mds_read_och;
601 och_usecount = &lli->lli_open_fd_read_count;
604 mutex_lock(&lli->lli_och_mutex);
605 if (*och_p) { /* Open handle is present */
606 if (it_disposition(it, DISP_OPEN_OPEN)) {
607 /* Well, there's extra open request that we do not need,
608 let's close it somehow. This will decref request. */
609 rc = it_open_error(DISP_OPEN_OPEN, it);
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
615 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS open handle for this local open. */
619 rc = ll_local_open(file, it, fd, NULL);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 LASSERT(*och_usecount == 0);
627 if (!it->d.lustre.it_disposition) {
628 /* We cannot just request lock handle now, new ELC code
629 means that one of other OPEN locks for this file
630 could be cancelled, and since blocking ast handler
631 would attempt to grab och_mutex as well, that would
632 result in a deadlock */
633 mutex_unlock(&lli->lli_och_mutex);
634 it->it_create_mode |= M_CHECK_STALE;
635 rc = ll_intent_file_open(file, NULL, 0, it);
636 it->it_create_mode &= ~M_CHECK_STALE;
638 GOTO(out_openerr, rc);
642 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
644 GOTO(out_och_free, rc = -ENOMEM);
648 /* md_intent_lock() didn't get a request ref if there was an
649 * open error, so don't do cleanup on the request here
651 /* XXX (green): Should not we bail out on any error here, not
652 * just open error? */
653 rc = it_open_error(DISP_OPEN_OPEN, it);
655 GOTO(out_och_free, rc);
657 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
659 rc = ll_local_open(file, it, fd, *och_p);
661 GOTO(out_och_free, rc);
663 mutex_unlock(&lli->lli_och_mutex);
666 /* Must do this outside lli_och_mutex lock to prevent deadlock where
667 different kind of OPEN lock for this same inode gets cancelled
668 by ldlm_cancel_lru */
669 if (!S_ISREG(inode->i_mode))
670 GOTO(out_och_free, rc);
674 if (!lli->lli_has_smd) {
675 if (file->f_flags & O_LOV_DELAY_CREATE ||
676 !(file->f_mode & FMODE_WRITE)) {
677 CDEBUG(D_INODE, "object creation was delayed\n");
678 GOTO(out_och_free, rc);
681 file->f_flags &= ~O_LOV_DELAY_CREATE;
682 GOTO(out_och_free, rc);
/* Error path: undo the handle allocation, statahead claim and fd. */
686 if (och_p && *och_p) {
687 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
688 *och_p = NULL; /* OBD_FREE writes some magic there */
691 mutex_unlock(&lli->lli_och_mutex);
694 if (opendir_set != 0)
695 ll_stop_statahead(inode, lli->lli_opendir_key);
697 ll_file_data_put(fd);
699 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
702 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
703 ptlrpc_req_finished(it->d.lustre.it_data);
704 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, asynchronously cancel
 * the lease lock; LDLM_CB_CANCELING is handled below (body not visible in
 * this listing).
 */
710 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
711 struct ldlm_lock_desc *desc, void *data, int flag)
714 struct lustre_handle lockh;
718 case LDLM_CB_BLOCKING:
719 ldlm_lock2handle(lock, &lockh);
720 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
722 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
726 case LDLM_CB_CANCELING:
734 * Acquire a lease and open the file.
/*
 * If @file is given, the lease reuses that file's existing MDS open
 * handle (only allowed when this process is the sole opener and modes are
 * compatible).  Sends an IT_OPEN intent with MDS_OPEN_LEASE; the granted
 * lease lock is kept out of the LRU (LDLM_FL_NO_LRU) and marked exclusive
 * (LDLM_FL_EXCL) so a normal open cannot match it.  Returns the new och
 * or an ERR_PTR; on failure the openhandle/lock are rolled back via the
 * out_close/out_release_it paths.
 */
736 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
739 struct lookup_intent it = { .it_op = IT_OPEN };
740 struct ll_sb_info *sbi = ll_i2sbi(inode);
741 struct md_op_data *op_data;
742 struct ptlrpc_request *req;
743 struct lustre_handle old_handle = { 0 };
744 struct obd_client_handle *och = NULL;
749 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
750 RETURN(ERR_PTR(-EINVAL));
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
758 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
759 RETURN(ERR_PTR(-EPERM));
761 /* Get the openhandle of the file */
763 mutex_lock(&lli->lli_och_mutex);
764 if (fd->fd_lease_och != NULL) {
765 mutex_unlock(&lli->lli_och_mutex);
769 if (fd->fd_och == NULL) {
770 if (file->f_mode & FMODE_WRITE) {
771 LASSERT(lli->lli_mds_write_och != NULL);
772 och_p = &lli->lli_mds_write_och;
773 och_usecount = &lli->lli_open_fd_write_count;
775 LASSERT(lli->lli_mds_read_och != NULL);
776 och_p = &lli->lli_mds_read_och;
777 och_usecount = &lli->lli_open_fd_read_count;
779 if (*och_usecount == 1) {
786 mutex_unlock(&lli->lli_och_mutex);
787 if (rc < 0) /* more than 1 opener */
790 LASSERT(fd->fd_och != NULL);
791 old_handle = fd->fd_och->och_fh;
796 RETURN(ERR_PTR(-ENOMEM));
798 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
799 LUSTRE_OPC_ANY, NULL);
801 GOTO(out, rc = PTR_ERR(op_data));
803 /* To tell the MDT this openhandle is from the same owner */
804 op_data->op_handle = old_handle;
806 it.it_flags = fmode | MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
807 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
808 ll_md_blocking_lease_ast,
809 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
810 * it can be cancelled which may mislead applications that the lease is
812 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
813 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
814 * doesn't deal with openhandle, so normal openhandle will be leaked. */
815 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
816 ll_finish_md_op_data(op_data);
818 ptlrpc_req_finished(req);
819 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
822 GOTO(out_release_it, rc);
824 if (it_disposition(&it, DISP_LOOKUP_NEG))
825 GOTO(out_release_it, rc = -ENOENT);
827 rc = it_open_error(DISP_OPEN_OPEN, &it);
829 GOTO(out_release_it, rc);
831 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
832 ll_och_fill(sbi->ll_md_exp, &it, och);
834 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
835 GOTO(out_close, rc = -EOPNOTSUPP);
837 /* already get lease, handle lease lock */
838 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
839 if (it.d.lustre.it_lock_mode == 0 ||
840 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
841 /* open lock must return for lease */
842 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
843 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
844 it.d.lustre.it_lock_bits);
845 GOTO(out_close, rc = -EPROTO);
848 ll_intent_release(&it);
/* Error unwinding: close openhandle, cancel lock, release intent. */
852 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och);
854 CERROR("Close openhandle returned %d\n", rc2);
856 /* cancel open lock */
857 if (it.d.lustre.it_lock_mode != 0) {
858 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
859 it.d.lustre.it_lock_mode);
860 it.d.lustre.it_lock_mode = 0;
863 ll_intent_release(&it);
868 EXPORT_SYMBOL(ll_lease_open);
871 * Release lease and close the file.
872 * It will check if the lease has ever broken.
874 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
877 struct ldlm_lock *lock;
878 bool cancelled = true;
/* Inspect the lease lock (under its resource lock) to learn whether the
 * lease was already broken/cancelled before we got here. */
882 lock = ldlm_handle2lock(&och->och_lease_handle);
884 lock_res_and_lock(lock);
885 cancelled = ldlm_is_cancel(lock);
886 unlock_res_and_lock(lock);
890 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
891 PFID(&ll_i2info(inode)->lli_fid), cancelled);
894 ldlm_cli_cancel(&och->och_lease_handle, 0);
895 if (lease_broken != NULL)
896 *lease_broken = cancelled;
898 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och);
901 EXPORT_SYMBOL(ll_lease_close);
903 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for every stripe in @lsm and wait for the
 * set to complete.  @ioepoch is packed into the obdo; @sync requests the
 * getattr under a server-side lock (OBD_FL_SRVLOCK).  On success o_valid
 * is masked down to the attribute bits callers may trust.
 */
904 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
905 struct obd_capa *capa, struct obdo *obdo,
906 __u64 ioepoch, int sync)
908 struct ptlrpc_request_set *set;
909 struct obd_info oinfo = { { { 0 } } };
914 LASSERT(lsm != NULL);
918 oinfo.oi_oa->o_oi = lsm->lsm_oi;
919 oinfo.oi_oa->o_mode = S_IFREG;
920 oinfo.oi_oa->o_ioepoch = ioepoch;
921 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
922 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
923 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
924 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
925 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
926 OBD_MD_FLDATAVERSION;
927 oinfo.oi_capa = capa;
929 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
930 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
933 set = ptlrpc_prep_set();
935 CERROR("can't allocate ptlrpc set\n");
938 rc = obd_getattr_async(exp, &oinfo, set);
940 rc = ptlrpc_set_wait(set);
941 ptlrpc_set_destroy(set);
944 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
945 OBD_MD_FLATIME | OBD_MD_FLMTIME |
946 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
947 OBD_MD_FLDATAVERSION);
952 * Performs the getattr on the inode and updates its fields.
953 * If @sync != 0, perform the getattr under the server-side lock.
955 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
956 __u64 ioepoch, int sync)
958 struct obd_capa *capa = ll_mdscapa_get(inode);
959 struct lov_stripe_md *lsm;
963 lsm = ccc_inode_lsm_get(inode);
964 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
965 capa, obdo, ioepoch, sync);
/* On success fold the OST attributes back into the VFS inode. */
968 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
970 obdo_refresh_inode(inode, obdo, obdo->o_valid);
971 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
972 " blksize %lu\n", POSTID(oi), i_size_read(inode),
973 (unsigned long long)inode->i_blocks,
974 (unsigned long)ll_inode_blksize(inode));
976 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with the attributes currently
 * cached in the cl_object (OST side), keeping the newest of each time
 * and updating i_size/i_blocks, all under the inode size lock.
 */
982 struct ll_inode_info *lli = ll_i2info(inode);
983 struct cl_object *obj = lli->lli_clob;
984 struct cl_attr *attr = ccc_env_thread_attr(env);
990 ll_inode_size_lock(inode);
991 /* merge timestamps the most recently obtained from mds with
992 timestamps obtained from osts */
993 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
994 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
995 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
996 inode_init_lvb(inode, &lvb);
998 cl_object_attr_lock(obj);
999 rc = cl_object_attr_get(env, obj, attr);
1000 cl_object_attr_unlock(obj);
/* Keep the most recent timestamp from either source. */
1003 if (lvb.lvb_atime < attr->cat_atime)
1004 lvb.lvb_atime = attr->cat_atime;
1005 if (lvb.lvb_ctime < attr->cat_ctime)
1006 lvb.lvb_ctime = attr->cat_ctime;
1007 if (lvb.lvb_mtime < attr->cat_mtime)
1008 lvb.lvb_mtime = attr->cat_mtime;
1010 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1011 PFID(&lli->lli_fid), attr->cat_size);
1012 cl_isize_write_nolock(inode, attr->cat_size);
1014 inode->i_blocks = attr->cat_blocks;
1016 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1017 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1018 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1020 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch OST attributes for @lsm into a local
 * obdo and copy size/blocks/times into the caller-provided stat buffer.
 */
1025 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1028 struct obdo obdo = { 0 };
1031 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1033 st->st_size = obdo.o_size;
1034 st->st_blocks = obdo.o_blocks;
1035 st->st_mtime = obdo.o_mtime;
1036 st->st_atime = obdo.o_atime;
1037 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behavior for writes, the backing cl_object, and the lock requirement
 * (never for nolock files, mandatory for O_APPEND, maybe otherwise).
 */
1042 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1044 struct inode *inode = file->f_dentry->d_inode;
1046 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1048 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1049 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1050 file->f_flags & O_DIRECT ||
1053 io->ci_obj = ll_i2info(inode)->lli_clob;
1054 io->ci_lockreq = CILR_MAYBE;
1055 if (ll_file_nolock(file)) {
1056 io->ci_lockreq = CILR_NEVER;
1057 io->ci_no_srvlock = 1;
1058 } else if (file->f_flags & O_APPEND) {
1059 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common read/write engine: set up a cl_io for @iot at *@ppos/@count,
 * dispatch on the IO subtype (normal iovec, sendfile, splice), run the
 * cl_io loop, and update *@ppos and the read/write byte statistics.
 * Normal writes outside a group lock serialize on lli_write_mutex; normal
 * reads take lli_trunc_sem shared.  On a restartable zero-byte result the
 * IO is retried (restart lines not visible in this listing).
 */
1064 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1065 struct file *file, enum cl_io_type iot,
1066 loff_t *ppos, size_t count)
1068 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1069 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1075 io = ccc_env_thread_io(env);
1076 ll_io_init(io, file, iot == CIT_WRITE);
1078 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1079 struct vvp_io *vio = vvp_env_io(env);
1080 struct ccc_io *cio = ccc_env_io(env);
1081 int write_mutex_locked = 0;
1083 cio->cui_fd = LUSTRE_FPRIVATE(file);
1084 vio->cui_io_subtype = args->via_io_subtype;
1086 switch (vio->cui_io_subtype) {
1088 cio->cui_iov = args->u.normal.via_iov;
1089 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1090 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1091 #ifndef HAVE_FILE_WRITEV
1092 cio->cui_iocb = args->u.normal.via_iocb;
1094 if ((iot == CIT_WRITE) &&
1095 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1096 if (mutex_lock_interruptible(&lli->
1098 GOTO(out, result = -ERESTARTSYS);
1099 write_mutex_locked = 1;
1100 } else if (iot == CIT_READ) {
1101 down_read(&lli->lli_trunc_sem);
1105 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1106 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1109 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1110 vio->u.splice.cui_flags = args->u.splice.via_flags;
1113 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1116 result = cl_io_loop(env, io);
1117 if (write_mutex_locked)
1118 mutex_unlock(&lli->lli_write_mutex);
1119 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1120 up_read(&lli->lli_trunc_sem);
1122 /* cl_io_rw_init() handled IO */
1123 result = io->ci_result;
1126 if (io->ci_nob > 0) {
1127 result = io->ci_nob;
1128 *ppos = io->u.ci_wr.wr.crw_pos;
1132 cl_io_fini(env, io);
1133 /* If any bit been read/written (result != 0), we just return
1134 * short read/write instead of restart io. */
1135 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1136 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1137 iot == CIT_READ ? "read" : "write",
1138 file->f_dentry->d_name.name, *ppos, count);
1139 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1143 if (iot == CIT_READ) {
1145 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1146 LPROC_LL_READ_BYTES, result);
1147 } else if (iot == CIT_WRITE) {
1149 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1150 LPROC_LL_WRITE_BYTES, result);
1151 fd->fd_write_failed = false;
1152 } else if (result != -ERESTARTSYS) {
1153 fd->fd_write_failed = true;
1162 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: reject negative segment lengths and cumulative
 * overflow, truncate *nr_segs at the first inaccessible segment, and
 * return the total byte count via *count.
 */
1164 static int ll_file_get_iov_count(const struct iovec *iov,
1165 unsigned long *nr_segs, size_t *count)
1170 for (seg = 0; seg < *nr_segs; seg++) {
1171 const struct iovec *iv = &iov[seg];
1174 * If any segment has a negative length, or the cumulative
1175 * length ever wraps negative then return -EINVAL.
1178 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1180 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1185 cnt -= iv->iov_len; /* This segment is no good */
1192 #ifdef HAVE_FILE_READV
/*
 * readv entry point (kernels with f_op->readv): validate the iovec,
 * grab a cl environment and run the generic IO engine for CIT_READ.
 */
1193 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1194 unsigned long nr_segs, loff_t *ppos)
1197 struct vvp_io_args *args;
1203 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1207 env = cl_env_get(&refcheck);
1209 RETURN(PTR_ERR(env));
1211 args = vvp_env_args(env, IO_NORMAL);
1212 args->u.normal.via_iov = (struct iovec *)iov;
1213 args->u.normal.via_nrsegs = nr_segs;
1215 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1216 cl_env_put(env, &refcheck);
/* read(2) entry point (readv-capable kernels): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_readv(). */
1220 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1224 struct iovec *local_iov;
1229 env = cl_env_get(&refcheck);
1231 RETURN(PTR_ERR(env));
1233 local_iov = &vvp_env_info(env)->vti_local_iov;
1234 local_iov->iov_base = (void __user *)buf;
1235 local_iov->iov_len = count;
1236 result = ll_file_readv(file, local_iov, 1, ppos);
1237 cl_env_put(env, &refcheck);
/*
 * aio_read entry point: validate the iovec, record the kiocb in the io
 * args, and run the generic IO engine for CIT_READ at iocb->ki_pos.
 */
1242 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1243 unsigned long nr_segs, loff_t pos)
1246 struct vvp_io_args *args;
1252 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1256 env = cl_env_get(&refcheck);
1258 RETURN(PTR_ERR(env));
1260 args = vvp_env_args(env, IO_NORMAL);
1261 args->u.normal.via_iov = (struct iovec *)iov;
1262 args->u.normal.via_nrsegs = nr_segs;
1263 args->u.normal.via_iocb = iocb;
1265 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1266 &iocb->ki_pos, count);
1267 cl_env_put(env, &refcheck);
/*
 * ->read() entry point (AIO kernels): build a synchronous kiocb plus a
 * one-element iovec in per-env scratch space and forward to
 * ll_file_aio_read(), then propagate the updated position back to *ppos.
 */
1271 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1275 struct iovec *local_iov;
1276 struct kiocb *kiocb;
1281 env = cl_env_get(&refcheck);
1283 RETURN(PTR_ERR(env));
1285 local_iov = &vvp_env_info(env)->vti_local_iov;
1286 kiocb = &vvp_env_info(env)->vti_kiocb;
1287 local_iov->iov_base = (void __user *)buf;
1288 local_iov->iov_len = count;
/* Synchronous kiocb: ll_file_aio_read() will update ki_pos in place. */
1289 init_sync_kiocb(kiocb, file);
1290 kiocb->ki_pos = *ppos;
1291 kiocb->ki_left = count;
1293 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1294 *ppos = kiocb->ki_pos;
1296 cl_env_put(env, &refcheck);
1302 * Write to a file (through the page cache).
1304 #ifdef HAVE_FILE_WRITEV
/*
 * ->writev() entry point (older kernels): mirror of ll_file_readv() but
 * issues a CIT_WRITE through ll_file_io_generic().
 */
1305 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1306 unsigned long nr_segs, loff_t *ppos)
1309 struct vvp_io_args *args;
1315 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1319 env = cl_env_get(&refcheck);
1321 RETURN(PTR_ERR(env));
1323 args = vvp_env_args(env, IO_NORMAL);
1324 args->u.normal.via_iov = (struct iovec *)iov;
1325 args->u.normal.via_nrsegs = nr_segs;
1327 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1328 cl_env_put(env, &refcheck);
/*
 * ->write() entry point (HAVE_FILE_WRITEV kernels): wrap the user buffer
 * in a one-element env-local iovec and forward to ll_file_writev().
 */
1332 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1336 struct iovec *local_iov;
1341 env = cl_env_get(&refcheck);
1343 RETURN(PTR_ERR(env));
1345 local_iov = &vvp_env_info(env)->vti_local_iov;
1346 local_iov->iov_base = (void __user *)buf;
1347 local_iov->iov_len = count;
1349 result = ll_file_writev(file, local_iov, 1, ppos);
1350 cl_env_put(env, &refcheck);
1354 #else /* AIO stuff */
/*
 * ->aio_write() entry point: validate the iovec, record iov/nr_segs/iocb
 * in vvp_io_args and run the write through ll_file_io_generic() at
 * iocb->ki_pos.
 */
1355 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1356 unsigned long nr_segs, loff_t pos)
1359 struct vvp_io_args *args;
1365 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1369 env = cl_env_get(&refcheck);
1371 RETURN(PTR_ERR(env));
1373 args = vvp_env_args(env, IO_NORMAL);
1374 args->u.normal.via_iov = (struct iovec *)iov;
1375 args->u.normal.via_nrsegs = nr_segs;
1376 args->u.normal.via_iocb = iocb;
/* Position comes from the iocb, not the 'pos' argument. */
1378 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1379 &iocb->ki_pos, count);
1380 cl_env_put(env, &refcheck);
/*
 * ->write() entry point (AIO kernels): build a synchronous kiocb plus a
 * one-element env-local iovec, forward to ll_file_aio_write() and
 * propagate the updated position back to *ppos.
 */
1384 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1388 struct iovec *local_iov;
1389 struct kiocb *kiocb;
1394 env = cl_env_get(&refcheck);
1396 RETURN(PTR_ERR(env));
1398 local_iov = &vvp_env_info(env)->vti_local_iov;
1399 kiocb = &vvp_env_info(env)->vti_kiocb;
1400 local_iov->iov_base = (void __user *)buf;
1401 local_iov->iov_len = count;
/* Synchronous kiocb: ll_file_aio_write() will update ki_pos in place. */
1402 init_sync_kiocb(kiocb, file);
1403 kiocb->ki_pos = *ppos;
1404 kiocb->ki_left = count;
1406 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1407 *ppos = kiocb->ki_pos;
1409 cl_env_put(env, &refcheck);
1415 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read() entry point: fill the IO_SPLICE variant of vvp_io_args
 * with the target pipe and flags, then run a CIT_READ through the generic
 * cl_io path.
 */
1417 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1418 struct pipe_inode_info *pipe, size_t count,
1422 struct vvp_io_args *args;
1427 env = cl_env_get(&refcheck);
1429 RETURN(PTR_ERR(env));
1431 args = vvp_env_args(env, IO_SPLICE);
1432 args->u.splice.via_pipe = pipe;
1433 args->u.splice.via_flags = flags;
1435 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1436 cl_env_put(env, &refcheck);
/*
 * Re-create the OST object(s) backing @inode on OST index @ost_idx with
 * object id @oi.  Builds an obdo describing the object, duplicates the
 * inode's LSM, and issues obd_create() under the inode size lock.
 * Returns 0 on success or a negative errno.
 */
1440 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1443 struct obd_export *exp = ll_i2dtexp(inode);
1444 struct obd_trans_info oti = { 0 };
1445 struct obdo *oa = NULL;
1448 struct lov_stripe_md *lsm = NULL, *lsm2;
/* No objects behind the inode -> nothing to recreate. */
1455 lsm = ccc_inode_lsm_get(inode);
1456 if (!lsm_has_objects(lsm))
1457 GOTO(out, rc = -ENOENT);
1459 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1460 (lsm->lsm_stripe_count));
1462 OBD_ALLOC_LARGE(lsm2, lsm_size);
1464 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for OBD_FL_RECREATE_OBJS. */
1467 oa->o_nlink = ost_idx;
1468 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1469 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1470 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1471 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1472 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
/* Work on a private copy of the LSM while recreating. */
1473 memcpy(lsm2, lsm, lsm_size);
1474 ll_inode_size_lock(inode);
1475 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1476 ll_inode_size_unlock(inode);
1478 OBD_FREE_LARGE(lsm2, lsm_size);
1481 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a struct ll_recreate_obj from user
 * space and recreate the object by (mdt0) object id.  Root-only.
 */
1486 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1488 struct ll_recreate_obj ucreat;
1492 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1495 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* Legacy id: always in the MDT0 sequence. */
1499 ostid_set_seq_mdt0(&oi);
1500 ostid_set_id(&oi, ucreat.lrc_id);
1501 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a FID from user space, convert it to
 * an OST id and recreate the object.  The OST index is packed in the FID
 * sequence (bits 16-31).  Root-only.
 */
1504 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1511 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1514 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1517 fid_to_ostid(&fid, &oi);
1518 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1519 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on @inode by performing an intent open with the
 * user-supplied lov_user_md.  Fails if a layout already exists.  The
 * open handle obtained for the setstripe is released before returning.
 * Returns 0 on success or a negative errno.
 */
1522 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1523 int flags, struct lov_user_md *lum, int lum_size)
1525 struct lov_stripe_md *lsm = NULL;
1526 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A layout can only be set once; bail out if one already exists. */
1530 lsm = ccc_inode_lsm_get(inode);
1532 ccc_inode_lsm_put(inode, lsm);
1533 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1538 ll_inode_size_lock(inode);
1539 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1542 rc = oit.d.lustre.it_status;
1544 GOTO(out_req_free, rc);
/* The open was only a vehicle for the setstripe; close it again. */
1546 ll_release_openhandle(file->f_dentry, &oit);
1549 ll_inode_size_unlock(inode);
1550 ll_intent_release(&oit);
1551 ccc_inode_lsm_put(inode, lsm);
1554 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA of @filename (a child of @inode) from the MDS.
 * On success *lmmp/*lmm_size point into the reply buffer of *request;
 * the caller owns the request and must ptlrpc_req_finished() it.
 * The EA is byte-swapped to host endianness for userspace consumption.
 * Returns 0 on success or a negative errno.
 */
1558 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1559 struct lov_mds_md **lmmp, int *lmm_size,
1560 struct ptlrpc_request **request)
1562 struct ll_sb_info *sbi = ll_i2sbi(inode);
1563 struct mdt_body *body;
1564 struct lov_mds_md *lmm = NULL;
1565 struct ptlrpc_request *req = NULL;
1566 struct md_op_data *op_data;
/* Size the reply buffer for the largest possible MDS EA. */
1569 rc = ll_get_max_mdsize(sbi, &lmmsize);
1573 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1574 strlen(filename), lmmsize,
1575 LUSTRE_OPC_ANY, NULL);
1576 if (IS_ERR(op_data))
1577 RETURN(PTR_ERR(op_data));
1579 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1580 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1581 ll_finish_md_op_data(op_data);
1583 CDEBUG(D_INFO, "md_getattr_name failed "
1584 "on %s: rc %d\n", filename, rc);
1588 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1589 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1591 lmmsize = body->eadatasize;
/* No striping EA present -> -ENODATA. */
1593 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1595 GOTO(out, rc = -ENODATA);
1598 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1599 LASSERT(lmm != NULL);
/* Only V1/V3 layouts are understood here. */
1601 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1602 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1603 GOTO(out, rc = -EPROTO);
1607 * This is coming from the MDS, so is probably in
1608 * little endian. We convert it to host endian before
1609 * passing it to userspace.
/* Only swab when host endianness differs from the wire format. */
1611 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1614 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1615 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1618 /* if function called for directory - we should
1619 * avoid swab not existent lsm objects */
1620 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1621 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* Per-object entries exist only for regular files. */
1622 if (S_ISREG(body->mode))
1623 lustre_swab_lov_user_md_objects(
1624 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1626 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1627 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1628 if (S_ISREG(body->mode))
1629 lustre_swab_lov_user_md_objects(
1630 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1637 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data entry)
 * from user space and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS, i.e. the objects are supplied by the caller.
 * Root-only.
 */
1642 static int ll_lov_setea(struct inode *inode, struct file *file,
1645 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1646 struct lov_user_md *lump;
1647 int lum_size = sizeof(struct lov_user_md) +
1648 sizeof(struct lov_user_ost_data);
1652 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1655 OBD_ALLOC_LARGE(lump, lum_size);
1659 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1660 OBD_FREE_LARGE(lump, lum_size);
1664 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1666 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy a lov_user_md (V1 first, re-read as
 * V3 if the magic says so) from user space, set the stripe EA, then on
 * success refresh the layout and return the resulting striping to the
 * caller via obd_iocontrol(LL_IOC_LOV_GETSTRIPE).
 */
1670 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1673 struct lov_user_md_v3 lumv3;
1674 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1675 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1676 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1678 int flags = FMODE_WRITE;
1681 /* first try with v1 which is smaller than v3 */
1682 lum_size = sizeof(struct lov_user_md_v1);
1683 if (copy_from_user(lumv1, lumv1p, lum_size))
/* V3 magic -> re-copy the larger V3 structure. */
1686 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1687 lum_size = sizeof(struct lov_user_md_v3);
1688 if (copy_from_user(&lumv3, lumv3p, lum_size))
1692 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1694 struct lov_stripe_md *lsm;
/* Clear user's stripe count first; GETSTRIPE below fills it in. */
1697 put_user(0, &lumv1p->lmm_stripe_count);
1699 ll_layout_refresh(inode, &gen);
1700 lsm = ccc_inode_lsm_get(inode);
1701 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1702 0, lsm, (void *)arg);
1703 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's LSM to the LOV layer,
 * which copies the striping information out to user space at @arg.
 */
1708 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1710 struct lov_stripe_md *lsm;
1714 lsm = ccc_inode_lsm_get(inode);
1716 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1718 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a group (GID-based) extent lock on the
 * file.  Only one group lock per file descriptor; the fd remembers the
 * lock in fd_grouplock and marks LL_FILE_GROUP_LOCKED.  Racing acquirers
 * are resolved under lli_lock after the cl-level lock is obtained.
 */
1722 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1724 struct ll_inode_info *lli = ll_i2info(inode);
1725 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1726 struct ccc_grouplock grouplock;
/* Group locks make no sense on nolock files. */
1730 if (ll_file_nolock(file))
1731 RETURN(-EOPNOTSUPP);
1733 spin_lock(&lli->lli_lock);
1734 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1735 CWARN("group lock already existed with gid %lu\n",
1736 fd->fd_grouplock.cg_gid);
1737 spin_unlock(&lli->lli_lock);
1740 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1741 spin_unlock(&lli->lli_lock);
/* Acquire the cl-layer group lock outside lli_lock (may block). */
1743 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1744 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under lli_lock: another thread may have won the race. */
1748 spin_lock(&lli->lli_lock);
1749 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1750 spin_unlock(&lli->lli_lock);
1751 CERROR("another thread just won the race\n");
1752 cl_put_grouplock(&grouplock);
1756 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1757 fd->fd_grouplock = grouplock;
1758 spin_unlock(&lli->lli_lock);
1760 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held on this file
 * descriptor, verifying that a group lock is held and that its GID
 * matches @arg.  State is cleared under lli_lock before the cl-layer
 * lock is dropped.
 */
1764 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1766 struct ll_inode_info *lli = ll_i2info(inode);
1767 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1768 struct ccc_grouplock grouplock;
1771 spin_lock(&lli->lli_lock);
1772 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1773 spin_unlock(&lli->lli_lock);
1774 CWARN("no group lock held\n");
1777 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* Unlock must use the same GID the lock was taken with. */
1779 if (fd->fd_grouplock.cg_gid != arg) {
1780 CWARN("group lock %lu doesn't match current id %lu\n",
1781 arg, fd->fd_grouplock.cg_gid);
1782 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd before releasing it. */
1786 grouplock = fd->fd_grouplock;
1787 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1788 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1789 spin_unlock(&lli->lli_lock);
1791 cl_put_grouplock(&grouplock);
1792 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1797 * Close inode open handle
1799 * \param dentry [in] dentry which contains the inode
1800 * \param it [in,out] intent which contains open info and result
1803 * \retval <0 failure
1805 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1807 struct inode *inode = dentry->d_inode;
1808 struct obd_client_handle *och;
1814 /* Root ? Do nothing. */
1815 if (dentry->d_inode->i_sb->s_root == dentry)
1818 /* No open handle to close? Move away */
1819 if (!it_disposition(it, DISP_OPEN_OPEN))
1822 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1824 OBD_ALLOC(och, sizeof(*och));
1826 GOTO(out, rc = -ENOMEM);
/* Populate the client handle from the intent, then close it. */
1828 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1830 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1833 /* this one is in place of ll_file_open */
1834 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1835 ptlrpc_req_finished(it->d.lustre.it_data);
1836 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1842 * Get size for inode for which FIEMAP mapping is requested.
1843 * Make the FIEMAP get_info call and returns the result.
1845 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1848 struct obd_export *exp = ll_i2dtexp(inode);
1849 struct lov_stripe_md *lsm = NULL;
1850 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1851 int vallen = num_bytes;
1855 /* Checks for fiemap flags */
/* Report unsupported flags back to the caller by stripping the
 * compatible ones from fm_flags. */
1856 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1857 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1861 /* Check for FIEMAP_FLAG_SYNC */
1862 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1863 rc = filemap_fdatawrite(inode->i_mapping);
1868 lsm = ccc_inode_lsm_get(inode);
1872 /* If the stripe_count > 1 and the application does not understand
1873 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1875 if (lsm->lsm_stripe_count > 1 &&
1876 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1877 GOTO(out, rc = -EOPNOTSUPP);
1879 fm_key.oa.o_oi = lsm->lsm_oi;
1880 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1882 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1883 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1884 /* If filesize is 0, then there would be no objects for mapping */
1885 if (fm_key.oa.o_size == 0) {
1886 fiemap->fm_mapped_extents = 0;
/* Ship the user's fiemap request down to the LOV/OSC layers. */
1890 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1892 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1895 CERROR("obd_get_info failed: rc = %d\n", rc);
1898 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Copies a getinfo_fid2path header in from user space, allocates an
 * output buffer sized by the user's gf_pathlen, performs the iocontrol
 * and copies the result back.  Requires DAC_READ_SEARCH capability
 * unless the mount allows user fid2path.
 */
1902 int ll_fid2path(struct inode *inode, void *arg)
1904 struct obd_export *exp = ll_i2mdexp(inode);
1905 struct getinfo_fid2path *gfout, *gfin;
1909 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1910 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1913 /* Need to get the buflen */
1914 OBD_ALLOC_PTR(gfin);
1917 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* Output buffer = fixed header + user-requested path length. */
1922 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1923 OBD_ALLOC(gfout, outsize);
1924 if (gfout == NULL) {
1928 memcpy(gfout, gfin, sizeof(*gfout));
1931 /* Call mdc_iocontrol */
1932 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1936 if (copy_to_user(arg, gfout, outsize))
1940 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: read the user's extent count, allocate a
 * kernel fiemap buffer large enough for that many extents, copy the
 * request (and, if present, the first extent used as a continuation
 * cookie) in, run ll_do_fiemap() and copy the mapped extents back.
 */
1944 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1946 struct ll_user_fiemap *fiemap_s;
1947 size_t num_bytes, ret_bytes;
1948 unsigned int extent_count;
1951 /* Get the extent count so we can calculate the size of
1952 * required fiemap buffer */
1953 if (get_user(extent_count,
1954 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1956 num_bytes = sizeof(*fiemap_s) + (extent_count *
1957 sizeof(struct ll_fiemap_extent));
1959 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1960 if (fiemap_s == NULL)
1963 /* get the fiemap value */
1964 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1966 GOTO(error, rc = -EFAULT);
1968 /* If fm_extent_count is non-zero, read the first extent since
1969 * it is used to calculate end_offset and device from previous
1972 if (copy_from_user(&fiemap_s->fm_extents[0],
1973 (char __user *)arg + sizeof(*fiemap_s),
1974 sizeof(struct ll_fiemap_extent)))
1975 GOTO(error, rc = -EFAULT);
1978 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy the header back always; extents only if the caller asked. */
1982 ret_bytes = sizeof(struct ll_user_fiemap);
1984 if (extent_count != 0)
1985 ret_bytes += (fiemap_s->fm_mapped_extents *
1986 sizeof(struct ll_fiemap_extent));
1988 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1992 OBD_FREE_LARGE(fiemap_s, num_bytes);
1997 * Read the data_version for inode.
1999 * This value is computed using stripe object version on OST.
2000 * Version is computed using server side locking.
2002 * @param extent_lock Take extent lock. Not needed if a process is already
2003 * holding the OST object group locks.
2005 int ll_data_version(struct inode *inode, __u64 *data_version,
2008 struct lov_stripe_md *lsm = NULL;
2009 struct ll_sb_info *sbi = ll_i2sbi(inode);
2010 struct obdo *obdo = NULL;
2014 /* If no stripe, we consider version is 0. */
2015 lsm = ccc_inode_lsm_get(inode);
2016 if (!lsm_has_objects(lsm)) {
2018 CDEBUG(D_INODE, "No object for inode\n");
2022 OBD_ALLOC_PTR(obdo);
2024 GOTO(out, rc = -ENOMEM);
/* Ask the OSTs for attributes including the data version. */
2026 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
2028 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2031 *data_version = obdo->o_data_version;
2037 ccc_inode_lsm_put(inode, lsm);
/* Scratch state for ll_swap_layouts(): saved timestamps, the two inodes
 * (possibly reordered by FID to sequentialize locking), and which data
 * versions must be checked before the swap. */
2041 struct ll_swap_stack {
2042 struct iattr ia1, ia2;
2044 struct inode *inode1, *inode2;
2045 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.  Optionally
 * flushes dirty cache under a group lock, verifies data versions if
 * requested, and can preserve mtime/atime across the swap.  The actual
 * swap is performed by the MDT via obd_iocontrol().
 */
2048 static int ll_swap_layouts(struct file *file1, struct file *file2,
2049 struct lustre_swap_layouts *lsl)
2051 struct mdc_swap_layouts msl;
2052 struct md_op_data *op_data;
2055 struct ll_swap_stack *llss = NULL;
2058 OBD_ALLOC_PTR(llss);
2062 llss->inode1 = file1->f_dentry->d_inode;
2063 llss->inode2 = file2->f_dentry->d_inode;
2065 if (!S_ISREG(llss->inode2->i_mode))
2066 GOTO(free, rc = -EINVAL);
/* Both files must be writable by the caller. */
2068 if (inode_permission(llss->inode1, MAY_WRITE) ||
2069 inode_permission(llss->inode2, MAY_WRITE))
2070 GOTO(free, rc = -EPERM);
2072 if (llss->inode2->i_sb != llss->inode1->i_sb)
2073 GOTO(free, rc = -EXDEV);
2075 /* we use 2 bool because it is easier to swap than 2 bits */
2076 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2077 llss->check_dv1 = true;
2079 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2080 llss->check_dv2 = true;
2082 /* we cannot use lsl->sl_dvX directly because we may swap them */
2083 llss->dv1 = lsl->sl_dv1;
2084 llss->dv2 = lsl->sl_dv2;
2086 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2087 if (rc == 0) /* same file, done! */
/* Order the pair by FID so locks are always taken in the same
 * order, avoiding deadlock between concurrent swappers. */
2090 if (rc < 0) { /* sequentialize it */
2091 swap(llss->inode1, llss->inode2);
2093 swap(llss->dv1, llss->dv2);
2094 swap(llss->check_dv1, llss->check_dv2);
2098 if (gid != 0) { /* application asks to flush dirty cache */
2099 rc = ll_get_grouplock(llss->inode1, file1, gid);
2103 rc = ll_get_grouplock(llss->inode2, file2, gid);
2105 ll_put_grouplock(llss->inode1, file1, gid);
2110 /* to be able to restore mtime and atime after swap
2111 * we need to first save them */
2113 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2114 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2115 llss->ia1.ia_atime = llss->inode1->i_atime;
2116 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2117 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2118 llss->ia2.ia_atime = llss->inode2->i_atime;
2119 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2122 /* ultimate check, before swaping the layouts we check if
2123 * dataversion has changed (if requested) */
2124 if (llss->check_dv1) {
2125 rc = ll_data_version(llss->inode1, &dv, 0);
2128 if (dv != llss->dv1)
2129 GOTO(putgl, rc = -EAGAIN);
2132 if (llss->check_dv2) {
2133 rc = ll_data_version(llss->inode2, &dv, 0);
2136 if (dv != llss->dv2)
2137 GOTO(putgl, rc = -EAGAIN);
2140 /* struct md_op_data is used to send the swap args to the mdt
2141 * only flags is missing, so we use struct mdc_swap_layouts
2142 * through the md_op_data->op_data */
2143 /* flags from user space have to be converted before they are send to
2144 * server, no flag is sent today, they are only used on the client */
2147 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2148 0, LUSTRE_OPC_ANY, &msl)
2149 if (IS_ERR(op_data))
2150 GOTO(free, rc = PTR_ERR(op_data));
2152 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2153 sizeof(*op_data), op_data, NULL);
2154 ll_finish_md_op_data(op_data);
2158 ll_put_grouplock(llss->inode2, file2, gid);
2159 ll_put_grouplock(llss->inode1, file1, gid);
2162 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2166 /* clear useless flags */
2167 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2168 llss->ia1.ia_valid &= ~ATTR_MTIME;
2169 llss->ia2.ia_valid &= ~ATTR_MTIME;
2172 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2173 llss->ia1.ia_valid &= ~ATTR_ATIME;
2174 llss->ia2.ia_valid &= ~ATTR_ATIME;
2177 /* update time if requested */
/* ia2 was saved from inode2 but after the swap it applies to the
 * file now holding inode2's data, i.e. file1 (and vice versa). */
2179 if (llss->ia2.ia_valid != 0) {
2180 mutex_lock(&llss->inode1->i_mutex);
2181 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2182 mutex_unlock(&llss->inode1->i_mutex);
2185 if (llss->ia1.ia_valid != 0) {
2188 mutex_lock(&llss->inode2->i_mutex);
2189 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2190 mutex_unlock(&llss->inode2->i_mutex);
/*
 * ->unlocked_ioctl() entry point: dispatch all llite file ioctls.
 * Tty ioctls are rejected up front; unknown commands fall through to
 * registered ioctl handlers and finally to the data export.
 */
2202 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2204 struct inode *inode = file->f_dentry->d_inode;
2205 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2209 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2210 inode->i_generation, inode, cmd);
2211 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2213 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2214 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2218 case LL_IOC_GETFLAGS:
2219 /* Get the current value of the file flags */
2220 return put_user(fd->fd_flags, (int *)arg);
2221 case LL_IOC_SETFLAGS:
2222 case LL_IOC_CLRFLAGS:
2223 /* Set or clear specific file flags */
2224 /* XXX This probably needs checks to ensure the flags are
2225 * not abused, and to handle any flag side effects.
2227 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only sane together with O_DIRECT. */
2230 if (cmd == LL_IOC_SETFLAGS) {
2231 if ((flags & LL_FILE_IGNORE_LOCK) &&
2232 !(file->f_flags & O_DIRECT)) {
2233 CERROR("%s: unable to disable locking on "
2234 "non-O_DIRECT file\n", current->comm);
2238 fd->fd_flags |= flags;
2240 fd->fd_flags &= ~flags;
2243 case LL_IOC_LOV_SETSTRIPE:
2244 RETURN(ll_lov_setstripe(inode, file, arg));
2245 case LL_IOC_LOV_SETEA:
2246 RETURN(ll_lov_setea(inode, file, arg));
2247 case LL_IOC_LOV_SWAP_LAYOUTS: {
2249 struct lustre_swap_layouts lsl;
2251 if (copy_from_user(&lsl, (char *)arg,
2252 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for writing to swap layouts. */
2255 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2258 file2 = fget(lsl.sl_fd);
2263 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2264 rc = ll_swap_layouts(file, file2, &lsl);
2268 case LL_IOC_LOV_GETSTRIPE:
2269 RETURN(ll_lov_getstripe(inode, arg));
2270 case LL_IOC_RECREATE_OBJ:
2271 RETURN(ll_lov_recreate_obj(inode, arg));
2272 case LL_IOC_RECREATE_FID:
2273 RETURN(ll_lov_recreate_fid(inode, arg));
2274 case FSFILT_IOC_FIEMAP:
2275 RETURN(ll_ioctl_fiemap(inode, arg));
2276 case FSFILT_IOC_GETFLAGS:
2277 case FSFILT_IOC_SETFLAGS:
2278 RETURN(ll_iocontrol(inode, file, cmd, arg));
2279 case FSFILT_IOC_GETVERSION_OLD:
2280 case FSFILT_IOC_GETVERSION:
2281 RETURN(put_user(inode->i_generation, (int *)arg));
2282 case LL_IOC_GROUP_LOCK:
2283 RETURN(ll_get_grouplock(inode, file, arg));
2284 case LL_IOC_GROUP_UNLOCK:
2285 RETURN(ll_put_grouplock(inode, file, arg));
2286 case IOC_OBD_STATFS:
2287 RETURN(ll_obd_statfs(inode, (void *)arg));
2289 /* We need to special case any other ioctls we want to handle,
2290 * to send them to the MDS/OST as appropriate and to properly
2291 * network encode the arg field.
2292 case FSFILT_IOC_SETVERSION_OLD:
2293 case FSFILT_IOC_SETVERSION:
2295 case LL_IOC_FLUSHCTX:
2296 RETURN(ll_flush_ctx(inode));
2297 case LL_IOC_PATH2FID: {
2298 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2299 sizeof(struct lu_fid)))
2304 case OBD_IOC_FID2PATH:
2305 RETURN(ll_fid2path(inode, (void *)arg));
2306 case LL_IOC_DATA_VERSION: {
2307 struct ioc_data_version idv;
2310 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH lets the caller skip the extent-lock flush. */
2313 rc = ll_data_version(inode, &idv.idv_version,
2314 !(idv.idv_flags & LL_DV_NOFLUSH));
2316 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2322 case LL_IOC_GET_MDTIDX: {
2325 mdtidx = ll_get_mdt_idx(inode);
2329 if (put_user((int)mdtidx, (int*)arg))
2334 case OBD_IOC_GETDTNAME:
2335 case OBD_IOC_GETMDNAME:
2336 RETURN(ll_get_obd_name(inode, cmd, arg));
2337 case LL_IOC_HSM_STATE_GET: {
2338 struct md_op_data *op_data;
2339 struct hsm_user_state *hus;
2346 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2347 LUSTRE_OPC_ANY, hus);
2348 if (IS_ERR(op_data)) {
2350 RETURN(PTR_ERR(op_data));
2353 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2356 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2359 ll_finish_md_op_data(op_data);
2363 case LL_IOC_HSM_STATE_SET: {
2364 struct md_op_data *op_data;
2365 struct hsm_state_set *hss;
2371 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2376 /* Non-root users are forbidden to set or clear flags which are
2377 * NOT defined in HSM_USER_MASK. */
2378 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2379 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2384 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2385 LUSTRE_OPC_ANY, hss);
2386 if (IS_ERR(op_data)) {
2388 RETURN(PTR_ERR(op_data));
2391 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2394 ll_finish_md_op_data(op_data);
2399 case LL_IOC_HSM_ACTION: {
2400 struct md_op_data *op_data;
2401 struct hsm_current_action *hca;
2408 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2409 LUSTRE_OPC_ANY, hca);
2410 if (IS_ERR(op_data)) {
2412 RETURN(PTR_ERR(op_data));
2415 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2418 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2421 ll_finish_md_op_data(op_data);
2425 case LL_IOC_SET_LEASE: {
2426 struct ll_inode_info *lli = ll_i2info(inode);
2427 struct obd_client_handle *och = NULL;
/* Lease mode must match the file's open mode. */
2433 if (!(file->f_mode & FMODE_WRITE))
2438 if (!(file->f_mode & FMODE_READ))
/* Mode 0 (unlock): detach and close any lease on this fd. */
2443 mutex_lock(&lli->lli_och_mutex);
2444 if (fd->fd_lease_och != NULL) {
2445 och = fd->fd_lease_och;
2446 fd->fd_lease_och = NULL;
2448 mutex_unlock(&lli->lli_och_mutex);
2451 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2452 rc = ll_lease_close(och, inode, &lease_broken);
2453 if (rc == 0 && lease_broken)
2459 /* return the type of lease or error */
2460 RETURN(rc < 0 ? rc : (int)mode);
2465 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2467 /* apply for lease */
2468 och = ll_lease_open(inode, file, mode);
2470 RETURN(PTR_ERR(och));
/* Install the lease on the fd unless a racer beat us to it. */
2473 mutex_lock(&lli->lli_och_mutex);
2474 if (fd->fd_lease_och == NULL) {
2475 fd->fd_lease_och = och;
2478 mutex_unlock(&lli->lli_och_mutex);
2480 /* impossible now that only excl is supported for now */
2481 ll_lease_close(och, inode, &lease_broken);
2486 case LL_IOC_GET_LEASE: {
2487 struct ll_inode_info *lli = ll_i2info(inode);
2488 struct ldlm_lock *lock = NULL;
2491 mutex_lock(&lli->lli_och_mutex);
2492 if (fd->fd_lease_och != NULL) {
2493 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only if the lock is still valid. */
2495 lock = ldlm_handle2lock(&och->och_lease_handle);
2497 lock_res_and_lock(lock);
2498 if (!ldlm_is_cancel(lock))
2499 rc = och->och_flags &
2500 (FMODE_READ | FMODE_WRITE);
2501 unlock_res_and_lock(lock);
2502 ldlm_lock_put(lock);
2505 mutex_unlock(&lli->lli_och_mutex);
/* Unknown command: try registered handlers, then the DT export. */
2513 ll_iocontrol_call(inode, file, cmd, arg, &err))
2516 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2522 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Commit an lseek result: range-check @offset against 0 and @maxsize,
 * then update f_pos (resetting f_version) only if it actually changed.
 */
2523 static inline loff_t
2524 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2526 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2528 if (offset > maxsize)
2531 if (offset != file->f_pos) {
2532 file->f_pos = offset;
2533 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels lacking it:
 * compute the new position for SEEK_SET/CUR/END/HOLE/DATA given the
 * file's @maxsize and current @eof, then apply it via llseek_execute().
 */
2539 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2540 loff_t maxsize, loff_t eof)
2542 struct inode *inode = file->f_dentry->d_inode;
2550 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2551 * position-querying operation. Avoid rewriting the "same"
2552 * f_pos value back to the file because a concurrent read(),
2553 * write() or lseek() might have altered it
2558 * f_lock protects against read/modify/write race with other
2559 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR is serialized under i_mutex here. */
2562 mutex_lock(&inode->i_mutex);
2563 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2564 mutex_unlock(&inode->i_mutex);
2568 * In the generic case the entire file is data, so as long as
2569 * offset isn't at the end of the file then the offset is data.
2576 * There is a virtual hole at the end of the file, so as long as
2577 * offset isn't i_size or larger, return i_size.
2585 return llseek_execute(file, offset, maxsize);
/*
 * ->llseek() entry point: for SEEK_END/HOLE/DATA first glimpse the file
 * size from the OSTs so eof is current, then delegate to the (possibly
 * local) generic_file_llseek_size() bounded by the client's maxbytes.
 */
2589 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2591 struct inode *inode = file->f_dentry->d_inode;
2592 loff_t retval, eof = 0;
2595 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2596 (origin == SEEK_CUR) ? file->f_pos : 0);
2597 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2598 inode->i_ino, inode->i_generation, inode, retval, retval,
2600 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on the true file size: refresh it first. */
2602 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2603 retval = ll_glimpse_size(inode);
2606 eof = i_size_read(inode);
2609 retval = ll_generic_file_llseek_size(file, offset, origin,
2610 ll_file_maxbytes(inode), eof);
/*
 * ->flush() entry point (called on close()): report any asynchronous
 * writeback error recorded on the inode/object, clearing it so it is
 * reported only once, and suppress the report if this fd already saw
 * the write failure.  Returns 0 or -EIO.
 */
2614 int ll_flush(struct file *file, fl_owner_t id)
2616 struct inode *inode = file->f_dentry->d_inode;
2617 struct ll_inode_info *lli = ll_i2info(inode);
2618 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2621 LASSERT(!S_ISDIR(inode->i_mode));
2623 /* catch async errors that were recorded back when async writeback
2624 * failed for pages in this mapping. */
2625 rc = lli->lli_async_rc;
2626 lli->lli_async_rc = 0;
2627 err = lov_read_and_clear_async_rc(lli->lli_clob);
2631 /* The application has been told write failure already.
2632 * Do not report failure again. */
2633 if (fd->fd_write_failed)
2635 return rc ? -EIO : 0;
2639 * Called to make sure a portion of file has been written out.
2640 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2642 * Return how many pages have been written.
2644 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2645 enum cl_fsync_mode mode, int ignore_layout)
2647 struct cl_env_nest nest;
2650 struct obd_capa *capa = NULL;
2651 struct cl_fsync_io *fio;
/* Only the four defined fsync modes are accepted. */
2655 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2656 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2659 env = cl_env_nested_get(&nest);
2661 RETURN(PTR_ERR(env));
2663 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
/* Build a CIT_FSYNC io covering [start, end] on the inode's object. */
2665 io = ccc_env_thread_io(env);
2666 io->ci_obj = cl_i2info(inode)->lli_clob;
2667 io->ci_ignore_layout = ignore_layout;
2669 /* initialize parameters for sync */
2670 fio = &io->u.ci_fsync;
2671 fio->fi_capa = capa;
2672 fio->fi_start = start;
2674 fio->fi_fid = ll_inode2fid(inode);
2675 fio->fi_mode = mode;
2676 fio->fi_nr_written = 0;
2678 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2679 result = cl_io_loop(env, io);
2681 result = io->ci_result;
/* On success the result is the number of pages written. */
2683 result = fio->fi_nr_written;
2684 cl_io_fini(env, io);
2685 cl_env_nested_put(&nest, env);
2693 * When dentry is provided (the 'else' case), *file->f_dentry may be
2694 * null and dentry must be used directly rather than pulled from
2695 * *file->f_dentry as is done otherwise.
/*
 * ->fsync() entry point (three kernel API variants): wait for in-flight
 * writeback, report and clear recorded async write errors, sync the MDS
 * state via md_sync(), and for datasync on regular files push dirty data
 * to the OSTs with cl_sync_file_range(CL_FSYNC_ALL).
 */
2698 #ifdef HAVE_FILE_FSYNC_4ARGS
2699 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2701 struct dentry *dentry = file->f_dentry;
2702 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2703 int ll_fsync(struct file *file, int datasync)
2705 struct dentry *dentry = file->f_dentry;
2707 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2710 struct inode *inode = dentry->d_inode;
2711 struct ll_inode_info *lli = ll_i2info(inode);
2712 struct ptlrpc_request *req;
2713 struct obd_capa *oc;
2717 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2718 inode->i_generation, inode);
2719 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2721 #ifdef HAVE_FILE_FSYNC_4ARGS
/* 4-arg variant must do its own range writeback under i_mutex. */
2722 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2723 mutex_lock(&inode->i_mutex);
2725 /* fsync's caller has already called _fdata{sync,write}, we want
2726 * that IO to finish before calling the osc and mdc sync methods */
2727 rc = filemap_fdatawait(inode->i_mapping);
2730 /* catch async errors that were recorded back when async writeback
2731 * failed for pages in this mapping. */
2732 if (!S_ISDIR(inode->i_mode)) {
2733 err = lli->lli_async_rc;
2734 lli->lli_async_rc = 0;
2737 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata on the MDS. */
2742 oc = ll_mdscapa_get(inode);
2743 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2749 ptlrpc_req_finished(req);
2751 if (datasync && S_ISREG(inode->i_mode)) {
2752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2754 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
/* Track per-fd write failure so ll_flush() reports it once. */
2756 if (rc == 0 && err < 0)
2759 fd->fd_write_failed = true;
2761 fd->fd_write_failed = false;
2764 #ifdef HAVE_FILE_FSYNC_4ARGS
2765 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): VFS ->flock and ->lock handler.  Translates a BSD
 * flock or POSIX fcntl lock request into an LDLM_FLOCK enqueue on the
 * MDS, then mirrors the result into the local kernel lock tables so
 * that the VFS view stays consistent with the cluster-wide state.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
2770 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2772 struct inode *inode = file->f_dentry->d_inode;
2773 struct ll_sb_info *sbi = ll_i2sbi(inode);
2774 struct ldlm_enqueue_info einfo = {
2775 .ei_type = LDLM_FLOCK,
2776 .ei_cb_cp = ldlm_flock_completion_ast,
2777 .ei_cbdata = file_lock,
2779 struct md_op_data *op_data;
2780 struct lustre_handle lockh = {0};
2781 ldlm_policy_data_t flock = {{0}};
2787 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2788 inode->i_ino, file_lock);
2790 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* translate the lock range and owner depending on lock family */
2792 if (file_lock->fl_flags & FL_FLOCK) {
2793 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2794 /* flocks are whole-file locks */
2795 flock.l_flock.end = OFFSET_MAX;
2796 /* For flocks owner is determined by the local file descriptor */
2797 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2798 } else if (file_lock->fl_flags & FL_POSIX) {
2799 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2800 flock.l_flock.start = file_lock->fl_start;
2801 flock.l_flock.end = file_lock->fl_end;
2805 flock.l_flock.pid = file_lock->fl_pid;
2807 /* Somewhat ugly workaround for svc lockd.
2808 * lockd installs custom fl_lmops->lm_compare_owner that checks
2809 * for the fl_owner to be the same (which it always is on local node
2810 * I guess between lockd processes) and then compares pid.
2811 * As such we assign pid to the owner field to make it all work,
2812 * conflict with normal locks is unlikely since pid space and
2813 * pointer space for current->files are not intersecting */
2814 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2815 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode */
2817 switch (file_lock->fl_type) {
2819 einfo.ei_mode = LCK_PR;
2822 /* An unlock request may or may not have any relation to
2823 * existing locks so we may not be able to pass a lock handle
2824 * via a normal ldlm_lock_cancel() request. The request may even
2825 * unlock a byte range in the middle of an existing lock. In
2826 * order to process an unlock request we need all of the same
2827 * information that is given with a normal read or write record
2828 * lock request. To avoid creating another ldlm unlock (cancel)
2829 * message we'll treat a LCK_NL flock request as an unlock. */
2830 einfo.ei_mode = LCK_NL;
2833 einfo.ei_mode = LCK_PW;
2836 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2837 file_lock->fl_type);
/* map the fcntl command to enqueue flags */
2852 flags = LDLM_FL_BLOCK_NOWAIT;
2858 flags = LDLM_FL_TEST_LOCK;
2859 /* Save the old mode so that if the mode in the lock changes we
2860 * can decrement the appropriate reader or writer refcount. */
2861 file_lock->fl_type = einfo.ei_mode;
2864 CERROR("unknown fcntl lock command: %d\n", cmd);
2868 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2869 LUSTRE_OPC_ANY, NULL);
2870 if (IS_ERR(op_data))
2871 RETURN(PTR_ERR(op_data));
2873 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2874 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2875 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* take (or test/release) the cluster-wide lock on the MDS */
2877 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2878 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the result into the local kernel lock tables */
2880 if ((file_lock->fl_flags & FL_FLOCK) &&
2881 (rc == 0 || file_lock->fl_type == F_UNLCK))
2882 rc2 = flock_lock_file_wait(file, file_lock);
2883 if ((file_lock->fl_flags & FL_POSIX) &&
2884 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2885 !(flags & LDLM_FL_TEST_LOCK))
2886 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server-side lock */
2888 if (rc2 && file_lock->fl_type != F_UNLCK) {
2889 einfo.ei_mode = LCK_NL;
2890 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2891 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2895 ll_finish_md_op_data(op_data);
/* -o noflock handler: flock/lock entry point that rejects file locking
 * (body elided in this excerpt). */
2900 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2908 * test if some locks matching bits and l_req_mode are acquired
2909 * - bits can be in different locks
2910 * - if found clear the common lock bits in *bits
2911 * - the bits not found, are kept in *bits
2913  * \param bits [IN] searched lock bits
2914 * \param l_req_mode [IN] searched lock mode
2915 * \retval boolean, true iff all bits are found
/*
 * ll_have_md_lock(): check whether cached MDS ibits locks of mode
 * @l_req_mode already cover the bits in *bits.  Bits found are cleared
 * from *bits; unmatched bits remain so the caller can re-fetch them.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
2917 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2919 struct lustre_handle lockh;
2920 ldlm_policy_data_t policy;
/* LCK_MINMODE means "match any mode" */
2921 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2922 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2931 fid = &ll_i2info(inode)->lli_fid;
2932 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2933 ldlm_lockname[mode]);
/* TEST_LOCK: probe only, do not take extra references */
2935 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each bit separately -- bits may be spread over several locks.
 * NOTE(review): `1 << i` is an int shift; safe only while
 * MDS_INODELOCK_MAXSHIFT stays below 31 -- confirm. */
2936 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2937 policy.l_inodebits.bits = *bits & (1 << i);
2938 if (policy.l_inodebits.bits == 0)
2941 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2942 &policy, mode, &lockh)) {
2943 struct ldlm_lock *lock;
2945 lock = ldlm_handle2lock(&lockh);
2948 ~(lock->l_policy_data.l_inodebits.bits);
2949 LDLM_LOCK_PUT(lock);
2951 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): try to match (and take a reference on) a granted
 * MDS ibits lock covering @bits in any of CR/CW/PR/PW modes.  Returns
 * the matched mode (0 if none); the handle is returned via @lockh.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
2958 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2959 struct lustre_handle *lockh, __u64 flags)
2961 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2966 fid = &ll_i2info(inode)->lli_fid;
2967 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2969 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2970 fid, LDLM_IBITS, &policy,
2971 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * ll_inode_revalidate_fini(): common epilogue for revalidation.
 * -ENOENT on an already-unlinked inode is converted to success; other
 * errors are logged and propagated.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
2975 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2977 /* Already unlinked. Just update nlink and return success */
2978 if (rc == -ENOENT) {
2980 /* This path cannot be hit for regular files unless in
2981 * case of obscure races, so no need to to validate
2983 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2985 } else if (rc != 0) {
2986 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2987 ll_get_fsname(inode->i_sb, NULL, 0),
2988 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate_it(): refresh inode attributes from the MDS
 * unless a cached ibits lock (@ibits) already guarantees they are
 * valid.  Two paths: an intent getattr-by-FID when the server supports
 * OBD_CONNECT_ATTRFID, otherwise a plain md_getattr.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
2994 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2997 struct inode *inode = dentry->d_inode;
2998 struct ptlrpc_request *req = NULL;
2999 struct obd_export *exp;
3003 LASSERT(inode != NULL);
3005 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3006 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3008 exp = ll_i2mdexp(inode);
3010 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3011 * But under CMD case, it caused some lock issues, should be fixed
3012 * with new CMD ibits lock. See bug 12718 */
3013 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3014 struct lookup_intent oit = { .it_op = IT_GETATTR };
3015 struct md_op_data *op_data;
/* lookup-only revalidation needs no attribute refresh intent */
3017 if (ibits == MDS_INODELOCK_LOOKUP)
3018 oit.it_op = IT_LOOKUP;
3020 /* Call getattr by fid, so do not provide name at all. */
3021 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
3022 dentry->d_inode, NULL, 0, 0,
3023 LUSTRE_OPC_ANY, NULL);
3024 if (IS_ERR(op_data))
3025 RETURN(PTR_ERR(op_data));
3027 oit.it_create_mode |= M_CHECK_STALE;
3028 rc = md_intent_lock(exp, op_data, NULL, 0,
3029 /* we are not interested in name
3032 ll_md_blocking_ast, 0);
3033 ll_finish_md_op_data(op_data);
3034 oit.it_create_mode &= ~M_CHECK_STALE;
3036 rc = ll_inode_revalidate_fini(inode, rc);
3040 rc = ll_revalidate_it_finish(req, &oit, dentry);
3042 ll_intent_release(&oit);
3046 /* Unlinked? Unhash dentry, so it is not picked up later by
3047 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3048 here to preserve get_cwd functionality on 2.6.
3050 if (!dentry->d_inode->i_nlink)
3051 d_lustre_invalidate(dentry, 0);
3053 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: plain getattr unless a cached lock covers ibits */
3054 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3055 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3056 obd_valid valid = OBD_MD_FLGETATTR;
3057 struct md_op_data *op_data;
/* regular files also need striping EA, sized for the largest MD */
3060 if (S_ISREG(inode->i_mode)) {
3061 rc = ll_get_max_mdsize(sbi, &ealen);
3064 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3067 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3068 0, ealen, LUSTRE_OPC_ANY,
3070 if (IS_ERR(op_data))
3071 RETURN(PTR_ERR(op_data));
3073 op_data->op_valid = valid;
3074 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3075 * capa for this inode. Because we only keep capas of dirs
3077 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3078 ll_finish_md_op_data(op_data);
3080 rc = ll_inode_revalidate_fini(inode, rc);
3084 rc = ll_prep_inode(&inode, req, NULL, NULL);
3087 ptlrpc_req_finished(req);
/*
 * ll_inode_revalidate_it(): revalidate attributes and, for regular
 * files, refresh the size via a glimpse unless an HSM restore is in
 * progress (the MDT then already provided the correct size).
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3091 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3094 struct inode *inode = dentry->d_inode;
3098 rc = __ll_inode_revalidate_it(dentry, it, ibits);
3102 /* if object isn't regular file, don't validate size */
3103 if (!S_ISREG(inode->i_mode)) {
/* take times straight from the cached lock value block */
3104 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3105 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3106 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3108 /* In case of restore, the MDT has the right size and has
3109 * already send it back without granting the layout lock,
3110 * inode is up-to-date so glimpse is useless.
3111 * Also to glimpse we need the layout, in case of a running
3112 * restore the MDT holds the layout lock so the glimpse will
3113 * block up to the end of restore (getattr will block)
3115 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3116 rc = ll_glimpse_size(inode);
/*
 * ll_getattr_it(): revalidate the inode (UPDATE|LOOKUP ibits) and then
 * fill *stat from the refreshed in-core inode fields.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3121 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3122 struct lookup_intent *it, struct kstat *stat)
3124 struct inode *inode = de->d_inode;
3125 struct ll_sb_info *sbi = ll_i2sbi(inode);
3126 struct ll_inode_info *lli = ll_i2info(inode);
3129 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3130 MDS_INODELOCK_LOOKUP);
3131 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3136 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland gets a squashed, FID-derived inode number */
3137 if (ll_need_32bit_api(sbi))
3138 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3140 stat->ino = inode->i_ino;
3141 stat->mode = inode->i_mode;
3142 stat->nlink = inode->i_nlink;
3143 stat->uid = inode->i_uid;
3144 stat->gid = inode->i_gid;
3145 stat->rdev = inode->i_rdev;
3146 stat->atime = inode->i_atime;
3147 stat->mtime = inode->i_mtime;
3148 stat->ctime = inode->i_ctime;
3149 stat->blksize = 1 << inode->i_blkbits;
3151 stat->size = i_size_read(inode);
3152 stat->blocks = inode->i_blocks;
/* VFS ->getattr: wrap ll_getattr_it() with a fresh IT_GETATTR intent. */
3156 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3158 struct lookup_intent it = { .it_op = IT_GETATTR };
3160 return ll_getattr_it(mnt, de, &it, stat);
3163 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * ll_fiemap(): VFS ->fiemap handler.  Builds a ll_user_fiemap buffer
 * sized for fi_extents_max extents, runs the mapping via
 * ll_do_fiemap(), and copies the resulting extents back to userspace.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3164 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3165 __u64 start, __u64 len)
3169 struct ll_user_fiemap *fiemap;
3170 unsigned int extent_count = fieinfo->fi_extents_max;
3172 num_bytes = sizeof(*fiemap) + (extent_count *
3173 sizeof(struct ll_fiemap_extent));
3174 OBD_ALLOC_LARGE(fiemap, num_bytes);
3179 fiemap->fm_flags = fieinfo->fi_flags;
3180 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3181 fiemap->fm_start = start;
3182 fiemap->fm_length = len;
/* only the first extent is copied in -- presumably used to continue a
 * previous mapping; confirm against ll_do_fiemap() */
3183 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3184 sizeof(struct ll_fiemap_extent));
3186 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3188 fieinfo->fi_flags = fiemap->fm_flags;
3189 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3190 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3191 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3193 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): return a referenced copy of the cached POSIX ACL.
 * lli_lock protects lli_posix_acl while the reference is taken.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3198 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3200 struct ll_inode_info *lli = ll_i2info(inode);
3201 struct posix_acl *acl = NULL;
3204 spin_lock(&lli->lli_lock);
3205 /* VFS' acl_permission_check->check_acl will release the refcount */
3206 acl = posix_acl_dup(lli->lli_posix_acl);
3207 spin_unlock(&lli->lli_lock);
/*
 * ll_check_acl(): ACL callback for generic_permission() on kernels
 * without the 2-arg variant.  Checks @mask against the cached access
 * ACL; compiled out when CONFIG_FS_POSIX_ACL is off.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3212 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3214 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3215 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3217 ll_check_acl(struct inode *inode, int mask)
3220 # ifdef CONFIG_FS_POSIX_ACL
3221 struct posix_acl *acl;
/* RCU walk cannot sleep -- bail out and let the caller retry */
3225 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3226 if (flags & IPERM_FLAG_RCU)
3229 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3234 rc = posix_acl_permission(inode, acl, mask);
3235 posix_acl_release(acl);
3238 # else /* !CONFIG_FS_POSIX_ACL */
3240 # endif /* CONFIG_FS_POSIX_ACL */
3242 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ll_inode_permission(): VFS ->permission handler (signature varies by
 * kernel).  Revalidates the root inode on first access, delegates to
 * remote permission checking on RMT_CLIENT mounts, and otherwise runs
 * the generic permission check with ll_check_acl.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3244 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3245 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3247 # ifdef HAVE_INODE_PERMISION_2ARGS
3248 int ll_inode_permission(struct inode *inode, int mask)
3250 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU path walk cannot block -- defer to the ref-walk retry */
3257 #ifdef MAY_NOT_BLOCK
3258 if (mask & MAY_NOT_BLOCK)
3260 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3261 if (flags & IPERM_FLAG_RCU)
3265 /* as root inode are NOT getting validated in lookup operation,
3266 * need to do it before permission check. */
3268 if (inode == inode->i_sb->s_root->d_inode) {
3269 struct lookup_intent it = { .it_op = IT_LOOKUP };
3271 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3272 MDS_INODELOCK_LOOKUP);
3277 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3278 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* remote-client mounts delegate the check to the server */
3280 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3281 return lustre_check_remote_perm(inode, mask);
3283 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3284 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored read/write file_operations member names and their
 * implementations to whatever this kernel provides: legacy readv/writev
 * or the aio_read/aio_write interface.  The macros are used as struct
 * designators below (.READ_METHOD expands to .readv or .aio_read). */
3289 #ifdef HAVE_FILE_READV
3290 #define READ_METHOD readv
3291 #define READ_FUNCTION ll_file_readv
3292 #define WRITE_METHOD writev
3293 #define WRITE_FUNCTION ll_file_writev
3295 #define READ_METHOD aio_read
3296 #define READ_FUNCTION ll_file_aio_read
3297 #define WRITE_METHOD aio_write
3298 #define WRITE_FUNCTION ll_file_aio_write
3301 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations: no .flock/.lock, so the kernel falls back to
 * node-local locking only */
3302 struct file_operations ll_file_operations = {
3303 .read = ll_file_read,
3304 .READ_METHOD = READ_FUNCTION,
3305 .write = ll_file_write,
3306 .WRITE_METHOD = WRITE_FUNCTION,
3307 .unlocked_ioctl = ll_file_ioctl,
3308 .open = ll_file_open,
3309 .release = ll_file_release,
3310 .mmap = ll_file_mmap,
3311 .llseek = ll_file_seek,
3312 .splice_read = ll_file_splice_read,
/* -o flock: same as the default table plus cluster-coherent .flock and
 * .lock handlers backed by ll_file_flock() */
3317 struct file_operations ll_file_operations_flock = {
3318 .read = ll_file_read,
3319 .READ_METHOD = READ_FUNCTION,
3320 .write = ll_file_write,
3321 .WRITE_METHOD = WRITE_FUNCTION,
3322 .unlocked_ioctl = ll_file_ioctl,
3323 .open = ll_file_open,
3324 .release = ll_file_release,
3325 .mmap = ll_file_mmap,
3326 .llseek = ll_file_seek,
3327 .splice_read = ll_file_splice_read,
3330 .flock = ll_file_flock,
3331 .lock = ll_file_flock
3334 /* These are for -o noflock - to return ENOSYS on flock calls */
3335 struct file_operations ll_file_operations_noflock = {
3336 .read = ll_file_read,
3337 .READ_METHOD = READ_FUNCTION,
3338 .write = ll_file_write,
3339 .WRITE_METHOD = WRITE_FUNCTION,
3340 .unlocked_ioctl = ll_file_ioctl,
3341 .open = ll_file_open,
3342 .release = ll_file_release,
3343 .mmap = ll_file_mmap,
3344 .llseek = ll_file_seek,
3345 .splice_read = ll_file_splice_read,
3348 .flock = ll_file_noflock,
3349 .lock = ll_file_noflock
/* inode_operations for regular files; fiemap and get_acl entries are
 * conditional on kernel support */
3352 struct inode_operations ll_file_inode_operations = {
3353 .setattr = ll_setattr,
3354 .getattr = ll_getattr,
3355 .permission = ll_inode_permission,
3356 .setxattr = ll_setxattr,
3357 .getxattr = ll_getxattr,
3358 .listxattr = ll_listxattr,
3359 .removexattr = ll_removexattr,
3360 #ifdef HAVE_LINUX_FIEMAP_H
3361 .fiemap = ll_fiemap,
3363 #ifdef HAVE_IOP_GET_ACL
3364 .get_acl = ll_get_acl,
3368 /* dynamic ioctl number support routins */
/* global registry of dynamically registered ioctl handlers; ioc_sem
 * guards the ioc_head list */
3369 static struct llioc_ctl_data {
3370 struct rw_semaphore ioc_sem;
3371 cfs_list_t ioc_head;
3373 __RWSEM_INITIALIZER(llioc.ioc_sem),
3374 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* one registered handler: its callback plus the ioctl commands it
 * serves; iocd_cmd is a trailing variable-length array */
3379 cfs_list_t iocd_list;
3380 unsigned int iocd_size;
3381 llioc_callback_t iocd_cb;
3382 unsigned int iocd_count;
3383 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register(): register callback @cb for @count ioctl
 * command numbers from @cmd.  Returns an opaque cookie used for
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3386 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3389 struct llioc_data *in_data = NULL;
3392 if (cb == NULL || cmd == NULL ||
3393 count > LLIOC_MAX_CMD || count < 0)
3396 size = sizeof(*in_data) + count * sizeof(unsigned int);
3397 OBD_ALLOC(in_data, size);
3398 if (in_data == NULL)
3401 memset(in_data, 0, sizeof(*in_data));
3402 in_data->iocd_size = size;
3403 in_data->iocd_cb = cb;
3404 in_data->iocd_count = count;
3405 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the registry write lock */
3407 down_write(&llioc.ioc_sem);
3408 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3409 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): remove and free the handler identified by
 * @magic (the cookie returned by ll_iocontrol_register()).  Warns if no
 * matching registration is found.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3414 void ll_iocontrol_unregister(void *magic)
3416 struct llioc_data *tmp;
3421 down_write(&llioc.ioc_sem);
3422 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* found it: unlink, drop the lock, then free */
3424 unsigned int size = tmp->iocd_size;
3426 cfs_list_del(&tmp->iocd_list);
3427 up_write(&llioc.ioc_sem);
3429 OBD_FREE(tmp, size);
3433 up_write(&llioc.ioc_sem);
3435 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3438 EXPORT_SYMBOL(ll_iocontrol_register);
3439 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch @cmd to the registered dynamic ioctl
 * handlers until one returns LLIOC_STOP.  The handler's result code is
 * passed back through *rcp.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3441 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3442 unsigned int cmd, unsigned long arg, int *rcp)
3444 enum llioc_iter ret = LLIOC_CONT;
3445 struct llioc_data *data;
3446 int rc = -EINVAL, i;
3448 down_read(&llioc.ioc_sem);
3449 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3450 for (i = 0; i < data->iocd_count; i++) {
3451 if (cmd != data->iocd_cmd[i])
3454 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3458 if (ret == LLIOC_STOP)
3461 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf(): push a layout configuration change down to the
 * cl_object stack via cl_conf_set().  For OBJECT_CONF_SET, the layout
 * lock only becomes matchable after the new layout is applied.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3468 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3470 struct ll_inode_info *lli = ll_i2info(inode);
3471 struct cl_env_nest nest;
3476 if (lli->lli_clob == NULL)
3479 env = cl_env_nested_get(&nest);
3481 RETURN(PTR_ERR(env));
3483 result = cl_conf_set(env, lli->lli_clob, conf);
3484 cl_env_nested_put(&nest, env);
3486 if (conf->coc_opc == OBJECT_CONF_SET) {
3487 struct ldlm_lock *lock = conf->coc_lock;
3489 LASSERT(lock != NULL);
3490 LASSERT(ldlm_has_layout(lock));
3492 /* it can only be allowed to match after layout is
3493 * applied to inode otherwise false layout would be
3494 * seen. Applying layout shoud happen before dropping
3495 * the intent lock. */
3496 ldlm_lock_allow_match(lock);
3502 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch(): pull the LOV layout xattr from the MDT and install
 * it as the layout lock's LVB (l_lvb_data/l_lvb_len).  A no-op when the
 * lock already carries a ready LVB.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3503 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3506 struct ll_sb_info *sbi = ll_i2sbi(inode);
3507 struct obd_capa *oc;
3508 struct ptlrpc_request *req;
3509 struct mdt_body *body;
3516 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3517 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3518 lock->l_lvb_data, lock->l_lvb_len);
3520 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3523 /* if layout lock was granted right away, the layout is returned
3524 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3525 * blocked and then granted via completion ast, we have to fetch
3526 * layout here. Please note that we can't use the LVB buffer in
3527 * completion AST because it doesn't have a large enough buffer */
3528 oc = ll_mdscapa_get(inode);
3529 rc = ll_get_max_mdsize(sbi, &lmmsize);
3531 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3532 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3538 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3539 if (body == NULL || body->eadatasize > lmmsize)
3540 GOTO(out, rc = -EPROTO);
3542 lmmsize = body->eadatasize;
3543 if (lmmsize == 0) /* empty layout */
3546 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3548 GOTO(out, rc = -EFAULT);
/* copy the layout out of the reply buffer and hang it off the lock */
3550 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3551 if (lvbdata == NULL)
3552 GOTO(out, rc = -ENOMEM);
3554 memcpy(lvbdata, lmm, lmmsize);
3555 lock_res_and_lock(lock);
3556 if (lock->l_lvb_data != NULL)
3557 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3559 lock->l_lvb_data = lvbdata;
3560 lock->l_lvb_len = lmmsize;
3561 unlock_res_and_lock(lock);
3566 ptlrpc_req_finished(req);
3571  * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set(): given a held layout lock (@lockh/@mode), fetch
 * the layout if needed, unpack it, configure the cl_object stack with
 * it, and return the layout generation through *gen.  @reconf selects
 * whether an already-ready LVB forces reconfiguration.  The lock
 * reference is dropped before returning; if reconfiguration raced with
 * IO (-EBUSY) the function waits for the IO to drain.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3574 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3575 struct inode *inode, __u32 *gen, bool reconf)
3577 struct ll_inode_info *lli = ll_i2info(inode);
3578 struct ll_sb_info *sbi = ll_i2sbi(inode);
3579 struct ldlm_lock *lock;
3580 struct lustre_md md = { NULL };
3581 struct cl_object_conf conf;
3584 bool wait_layout = false;
3587 LASSERT(lustre_handle_is_used(lockh));
3589 lock = ldlm_handle2lock(lockh);
3590 LASSERT(lock != NULL);
3591 LASSERT(ldlm_has_layout(lock));
3593 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3594 inode, PFID(&lli->lli_fid), reconf);
3596 /* in case this is a caching lock and reinstate with new inode */
3597 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3599 lock_res_and_lock(lock);
3600 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3601 unlock_res_and_lock(lock);
3602 /* checking lvb_ready is racy but this is okay. The worst case is
3603 * that multi processes may configure the file on the same time. */
3605 if (lvb_ready || !reconf) {
3608 /* layout_gen must be valid if layout lock is not
3609 * cancelled and stripe has already set */
3610 *gen = lli->lli_layout_gen;
3616 rc = ll_layout_fetch(inode, lock);
3620 /* for layout lock, lmm is returned in lock's lvb.
3621 * lvb_data is immutable if the lock is held so it's safe to access it
3622 * without res lock. See the description in ldlm_lock_decref_internal()
3623 * for the condition to free lvb_data of layout lock */
3624 if (lock->l_lvb_data != NULL) {
3625 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3626 lock->l_lvb_data, lock->l_lvb_len);
3628 *gen = LL_LAYOUT_GEN_EMPTY;
3630 *gen = md.lsm->lsm_layout_gen;
3633 CERROR("%s: file "DFID" unpackmd error: %d\n",
3634 ll_get_fsname(inode->i_sb, NULL, 0),
3635 PFID(&lli->lli_fid), rc);
3641 /* set layout to file. Unlikely this will fail as old layout was
3642 * surely eliminated */
3643 memset(&conf, 0, sizeof conf);
3644 conf.coc_opc = OBJECT_CONF_SET;
3645 conf.coc_inode = inode;
3646 conf.coc_lock = lock;
3647 conf.u.coc_md = &md;
3648 rc = ll_layout_conf(inode, &conf);
3651 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3653 /* refresh layout failed, need to wait */
3654 wait_layout = rc == -EBUSY;
/* drop our reference and lock ref before possibly waiting */
3658 LDLM_LOCK_PUT(lock);
3659 ldlm_lock_decref(lockh, mode);
3661 /* wait for IO to complete if it's still being used. */
3663 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3664 ll_get_fsname(inode->i_sb, NULL, 0),
3665 inode, PFID(&lli->lli_fid));
3667 memset(&conf, 0, sizeof conf);
3668 conf.coc_opc = OBJECT_CONF_WAIT;
3669 conf.coc_inode = inode;
3670 rc = ll_layout_conf(inode, &conf);
3674 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3675 PFID(&lli->lli_fid), rc);
3681 * This function checks if there exists a LAYOUT lock on the client side,
3682 * or enqueues it if it doesn't have one in cache.
3684 * This function will not hold layout lock so it may be revoked any time after
3685 * this function returns. Any operations depend on layout should be redone
3688 * This function should be called before lov_io_init() to get an uptodate
3689 * layout version, the caller should save the version number and after IO
3690 * is finished, this function should be called again to verify that layout
3691 * is not changed during IO time.
/*
 * ll_layout_refresh(): ensure the client holds an up-to-date layout for
 * @inode and return its generation via *gen.  First tries to match a
 * cached layout lock; otherwise serializes on lli_layout_mutex and
 * enqueues an IT_LAYOUT intent on the MDS.
 * NOTE(review): excerpt -- some original lines are elided here.
 */
3693 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3695 struct ll_inode_info *lli = ll_i2info(inode);
3696 struct ll_sb_info *sbi = ll_i2sbi(inode);
3697 struct md_op_data *op_data;
3698 struct lookup_intent it;
3699 struct lustre_handle lockh;
3701 struct ldlm_enqueue_info einfo = {
3702 .ei_type = LDLM_IBITS,
3704 .ei_cb_bl = ll_md_blocking_ast,
3705 .ei_cb_cp = ldlm_completion_ast,
3710 *gen = lli->lli_layout_gen;
/* server without layout-lock support: cached generation is all we have */
3711 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3715 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3716 LASSERT(S_ISREG(inode->i_mode));
3718 /* mostly layout lock is caching on the local side, so try to match
3719 * it before grabbing layout lock mutex. */
3720 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3721 if (mode != 0) { /* hit cached lock */
3722 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3726 /* better hold lli_layout_mutex to try again otherwise
3727 * it will have starvation problem. */
3730 /* take layout lock mutex to enqueue layout lock exclusively. */
3731 mutex_lock(&lli->lli_layout_mutex);
3734 /* try again. Maybe somebody else has done this. */
3735 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3736 if (mode != 0) { /* hit cached lock */
3737 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3741 mutex_unlock(&lli->lli_layout_mutex);
3745 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3746 0, 0, LUSTRE_OPC_ANY, NULL);
3747 if (IS_ERR(op_data)) {
3748 mutex_unlock(&lli->lli_layout_mutex);
3749 RETURN(PTR_ERR(op_data));
3752 /* have to enqueue one */
3753 memset(&it, 0, sizeof(it));
3754 it.it_op = IT_LAYOUT;
3755 lockh.cookie = 0ULL;
3757 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3758 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3759 PFID(&lli->lli_fid));
3761 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent request itself is no longer needed once the lock arrived */
3763 if (it.d.lustre.it_data != NULL)
3764 ptlrpc_req_finished(it.d.lustre.it_data);
3765 it.d.lustre.it_data = NULL;
3767 ll_finish_md_op_data(op_data);
/* transfer the lock reference out of the intent before dropping it */
3769 mode = it.d.lustre.it_lock_mode;
3770 it.d.lustre.it_lock_mode = 0;
3771 ll_intent_drop_lock(&it);
3774 /* set lock data in case this is a new lock */
3775 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3776 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3780 mutex_unlock(&lli->lli_layout_mutex);
3786 * This function send a restore request to the MDT
3788 int ll_layout_restore(struct inode *inode)
3790 struct hsm_user_request *hur;
3794 len = sizeof(struct hsm_user_request) +
3795 sizeof(struct hsm_user_item);
3796 OBD_ALLOC(hur, len);
3800 hur->hur_request.hr_action = HUA_RESTORE;
3801 hur->hur_request.hr_archive_id = 0;
3802 hur->hur_request.hr_flags = 0;
3803 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3804 sizeof(hur->hur_user_item[0].hui_fid));
3805 hur->hur_user_item[0].hui_extent.length = -1;
3806 hur->hur_request.hr_itemcount = 1;
3807 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,