4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open ll_file_data from the ll_file_data_slab cache.
 * NOTE(review): the allocation-failure check and the return statement are
 * not visible in this view (lines elided) -- confirm against full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
/* every new open starts with a clean write-failure state */
61 fd->fd_write_failed = false;
/* Release a per-open ll_file_data back to its slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's cached attributes (mode, times, size, blocks, flags),
 * its FID, capability, IO epoch and the open handle @fh into @op_data,
 * ready to be sent to the MDS.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-private wrapper around struct iattr */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* propagate the client-side data-modified flag to the MDS request bias */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
/* mode and all timestamps are always sent on close */
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
/* NOTE(review): presumably size/blocks are only sent from the client when
 * the server lacks Size-on-MDS support or this is not a regular file --
 * the surrounding elided lines should confirm this. */
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS close for the open handle @och: prepare the close op_data,
 * call md_close(), perform a Size-on-MDS update if the MDS asked for one,
 * clear the DATA_MODIFIED flag on success, destroy OST objects tied to the
 * close reply, and finally clear the open replay data and poison the handle.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is shared state; lli_lock protects the flag update */
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM epoch still open on a written regular file: defer the final
 * attribute update via the DONE_WRITING path */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Really close the per-mode MDS open handle of @inode, unless other users of
 * the same handle remain.  @flags selects which handle (write/exec/read).
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* pick the open-handle slot and its use count matching the open mode */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop the group lock and any lease still held by
 * this fd, close its private open handle, decrement the per-mode open count,
 * and only talk to the MDS (ll_md_real_close) when no cached OPEN lock lets
 * us skip it.  Finally detach and free the ll_file_data.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 if (fd->fd_lease_och != NULL) {
257 /* Usually the lease is not released when the
258 * application crashed, we need to release here. */
259 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
260 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
261 PFID(&lli->lli_fid), rc, lease_broken);
263 fd->fd_lease_och = NULL;
266 if (fd->fd_och != NULL) {
267 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och);
272 /* Let's see if we have good enough OPEN lock on the file and if
273 we can skip talking to MDS */
274 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a matching lock, do not take a reference */
276 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
277 struct lustre_handle lockh;
278 struct inode *inode = file->f_dentry->d_inode;
279 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
281 mutex_lock(&lli->lli_och_mutex);
282 if (fd->fd_omode & FMODE_WRITE) {
284 LASSERT(lli->lli_open_fd_write_count);
285 lli->lli_open_fd_write_count--;
286 } else if (fd->fd_omode & FMODE_EXEC) {
288 LASSERT(lli->lli_open_fd_exec_count);
289 lli->lli_open_fd_exec_count--;
292 LASSERT(lli->lli_open_fd_read_count);
293 lli->lli_open_fd_read_count--;
295 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN lock matched -- must do the real MDS close */
297 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
298 LDLM_IBITS, &policy, lockmode,
300 rc = ll_md_real_close(file->f_dentry->d_inode,
304 CERROR("Releasing a file %p with negative dentry %p. Name %s",
305 file, file->f_dentry, file->f_dentry->d_name.name);
309 LUSTRE_FPRIVATE(file) = NULL;
310 ll_file_data_put(fd);
311 ll_capa_close(inode);
316 /* While this returns an error code, fput() the caller does not, so we need
317 * to make every effort to clean up all of our state here. Also, applications
318 * rarely check close errors and even if an error is returned they will not
319 * re-try the close call.
321 int ll_file_release(struct inode *inode, struct file *file)
323 struct ll_file_data *fd;
324 struct ll_sb_info *sbi = ll_i2sbi(inode);
325 struct ll_inode_info *lli = ll_i2info(inode);
329 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
330 inode->i_generation, inode);
/* remote-client ACL bookkeeping applies only to the filesystem root */
332 #ifdef CONFIG_FS_POSIX_ACL
333 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
334 inode == inode->i_sb->s_root->d_inode) {
335 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
338 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
339 fd->fd_flags &= ~LL_FILE_RMTACL;
340 rct_del(&sbi->ll_rct, cfs_curproc_pid());
341 et_search_free(&sbi->ll_et, cfs_curproc_pid());
346 if (inode->i_sb->s_root != file->f_dentry)
347 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
348 fd = LUSTRE_FPRIVATE(file);
351 /* The last ref on @file, maybe not the owner pid of statahead.
352 * Different processes can open the same dir, "ll_opendir_key" means:
353 * it is me that should stop the statahead thread. */
354 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
355 lli->lli_opendir_pid != 0)
356 ll_stop_statahead(inode, lli->lli_opendir_key);
/* root dentry: nothing was opened on the MDS, just drop the fd */
358 if (inode->i_sb->s_root == file->f_dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
364 if (!S_ISDIR(inode->i_mode)) {
/* surface any async write errors recorded against the object */
365 lov_read_and_clear_async_rc(lli->lli_clob);
366 lli->lli_async_rc = 0;
369 rc = ll_md_close(sbi->ll_md_exp, inode, file);
371 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
372 libcfs_debug_dumplog();
/*
 * Enqueue an OPEN intent lock on the MDS for @file.  @lmm/@lmmsize carry
 * striping info when the caller is setting stripes; when both are zero this
 * is a plain open and we request an OPEN lock.  On success the inode is
 * refreshed from the reply and lock data is attached to it.
 */
377 static int ll_intent_file_open(struct file *file, void *lmm,
378 int lmmsize, struct lookup_intent *itp)
380 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
381 struct dentry *parent = file->f_dentry->d_parent;
382 struct md_op_data *op_data;
383 struct ptlrpc_request *req;
384 __u32 opc = LUSTRE_OPC_ANY;
391 /* Usually we come here only for NFSD, and we want open lock.
392 But we can also get here with pre 2.6.15 patchless kernels, and in
393 that case that lock is also ok */
394 /* We can also get here if there was cached open handle in revalidate_it
395 * but it disappeared while we were getting from there to ll_file_open.
396 * But this means this file was closed and immediately opened which
397 * makes a good candidate for using OPEN lock */
398 /* If lmmsize & lmm are not 0, we are just setting stripe info
399 * parameters. No need for the open lock */
400 if (lmm == NULL && lmmsize == 0) {
401 itp->it_flags |= MDS_OPEN_LOCK;
402 if (itp->it_flags & FMODE_WRITE)
403 opc = LUSTRE_OPC_CREATE;
406 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
407 file->f_dentry->d_inode, NULL, 0,
411 RETURN(PTR_ERR(op_data));
413 itp->it_flags |= MDS_OPEN_BY_FID;
414 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
415 0 /*unused */, &req, ll_md_blocking_ast, 0);
416 ll_finish_md_op_data(op_data);
418 /* reason for keep own exit path - don't flood log
419 * with messages with -ESTALE errors.
/* the MDS granted the open but it is no longer usable -- release it */
421 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
422 it_open_error(DISP_OPEN_OPEN, itp))
424 ll_release_openhandle(file->f_dentry, itp);
428 if (it_disposition(itp, DISP_LOOKUP_NEG))
429 GOTO(out, rc = -ENOENT);
431 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
432 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
433 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
437 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
438 if (!rc && itp->d.lustre.it_lock_mode)
439 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
443 ptlrpc_req_finished(itp->d.lustre.it_data);
444 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
445 ll_intent_drop_lock(itp);
451 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
452 * not believe attributes if a few ioepoch holders exist. Attributes for
453 * previous ioepoch if new one is opened are also skipped by MDS.
455 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* only record a new, non-zero epoch; re-opening the same epoch is a no-op */
457 if (ioepoch && lli->lli_ioepoch != ioepoch) {
458 lli->lli_ioepoch = ioepoch;
459 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
460 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill @och from the MDT reply body attached to the intent: open handle,
 * FID, lease lock handle and flags; then register the open for replay.
 */
464 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
465 struct obd_client_handle *och)
467 struct ptlrpc_request *req = it->d.lustre.it_data;
468 struct mdt_body *body;
470 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
471 och->och_fh = body->handle;
472 och->och_fid = body->fid1;
473 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
474 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
475 och->och_flags = it->it_flags;
/* so a failed-over MDS can reconstruct this open during recovery */
477 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a local open: optionally fill @och from the intent reply and open
 * the IO epoch, then attach @fd as the file's private data and initialize
 * its readahead state and open mode.
 */
480 int ll_local_open(struct file *file, struct lookup_intent *it,
481 struct ll_file_data *fd, struct obd_client_handle *och)
483 struct inode *inode = file->f_dentry->d_inode;
484 struct ll_inode_info *lli = ll_i2info(inode);
/* caller must not have attached private data yet */
487 LASSERT(!LUSTRE_FPRIVATE(file));
492 struct ptlrpc_request *req = it->d.lustre.it_data;
493 struct mdt_body *body;
496 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
500 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
501 ll_ioepoch_open(lli, body->ioepoch);
504 LUSTRE_FPRIVATE(file) = fd;
505 ll_readahead_init(inode, &fd->fd_ras);
/* remember only the access-mode bits this fd was opened with */
506 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
511 /* Open a file, and (for the very first open) create objects on the OSTs at
512 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
513 * creation or open until ll_lov_setstripe() ioctl is called.
515 * If we already have the stripe MD locally then we don't request it in
516 * md_open(), by passing a lmm_size = 0.
518 * It is up to the application to ensure no other processes open this file
519 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
520 * used. We might be able to avoid races of that sort by getting lli_open_sem
521 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
522 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
524 int ll_file_open(struct inode *inode, struct file *file)
526 struct ll_inode_info *lli = ll_i2info(inode);
527 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
528 .it_flags = file->f_flags };
529 struct obd_client_handle **och_p = NULL;
530 __u64 *och_usecount = NULL;
531 struct ll_file_data *fd;
532 int rc = 0, opendir_set = 0;
535 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
536 inode->i_generation, inode, file->f_flags);
/* an intent stashed by lookup/revalidate, if any */
538 it = file->private_data; /* XXX: compat macro */
539 file->private_data = NULL; /* prevent ll_local_open assertion */
541 fd = ll_file_data_get();
543 GOTO(out_openerr, rc = -ENOMEM);
/* first opener of a directory claims the statahead "opendir key" */
546 if (S_ISDIR(inode->i_mode)) {
547 spin_lock(&lli->lli_sa_lock);
548 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
549 lli->lli_opendir_pid == 0) {
550 lli->lli_opendir_key = fd;
551 lli->lli_opendir_pid = cfs_curproc_pid();
554 spin_unlock(&lli->lli_sa_lock);
/* root dentry needs no MDS open; just attach the fd */
557 if (inode->i_sb->s_root == file->f_dentry) {
558 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: build our own open intent in oit */
562 if (!it || !it->d.lustre.it_disposition) {
563 /* Convert f_flags into access mode. We cannot use file->f_mode,
564 * because everything but O_ACCMODE mask was stripped from
566 if ((oit.it_flags + 1) & O_ACCMODE)
568 if (file->f_flags & O_TRUNC)
569 oit.it_flags |= FMODE_WRITE;
571 /* kernel only call f_op->open in dentry_open. filp_open calls
572 * dentry_open after call to open_namei that checks permissions.
573 * Only nfsd_open call dentry_open directly without checking
574 * permissions and because of that this code below is safe. */
575 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
578 /* We do not want O_EXCL here, presumably we opened the file
579 * already? XXX - NFS implications? */
580 oit.it_flags &= ~O_EXCL;
582 /* bug20584, if "it_flags" contains O_CREAT, the file will be
583 * created if necessary, then "IT_CREAT" should be set to keep
584 * consistent with it */
585 if (oit.it_flags & O_CREAT)
586 oit.it_op |= IT_CREAT;
592 /* Let's see if we have file open on MDS already. */
593 if (it->it_flags & FMODE_WRITE) {
594 och_p = &lli->lli_mds_write_och;
595 och_usecount = &lli->lli_open_fd_write_count;
596 } else if (it->it_flags & FMODE_EXEC) {
597 och_p = &lli->lli_mds_exec_och;
598 och_usecount = &lli->lli_open_fd_exec_count;
600 och_p = &lli->lli_mds_read_och;
601 och_usecount = &lli->lli_open_fd_read_count;
604 mutex_lock(&lli->lli_och_mutex);
605 if (*och_p) { /* Open handle is present */
606 if (it_disposition(it, DISP_OPEN_OPEN)) {
607 /* Well, there's extra open request that we do not need,
608 let's close it somehow. This will decref request. */
609 rc = it_open_error(DISP_OPEN_OPEN, it);
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
615 ll_release_openhandle(file->f_dentry, it);
/* reuse the existing MDS handle for this local open */
619 rc = ll_local_open(file, it, fd, NULL);
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
626 LASSERT(*och_usecount == 0);
627 if (!it->d.lustre.it_disposition) {
628 /* We cannot just request lock handle now, new ELC code
629 means that one of other OPEN locks for this file
630 could be cancelled, and since blocking ast handler
631 would attempt to grab och_mutex as well, that would
632 result in a deadlock */
633 mutex_unlock(&lli->lli_och_mutex);
634 it->it_create_mode |= M_CHECK_STALE;
635 rc = ll_intent_file_open(file, NULL, 0, it);
636 it->it_create_mode &= ~M_CHECK_STALE;
638 GOTO(out_openerr, rc);
642 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
644 GOTO(out_och_free, rc = -ENOMEM);
648 /* md_intent_lock() didn't get a request ref if there was an
649 * open error, so don't do cleanup on the request here
651 /* XXX (green): Should not we bail out on any error here, not
652 * just open error? */
653 rc = it_open_error(DISP_OPEN_OPEN, it);
655 GOTO(out_och_free, rc);
657 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
659 rc = ll_local_open(file, it, fd, *och_p);
661 GOTO(out_och_free, rc);
663 mutex_unlock(&lli->lli_och_mutex);
666 /* Must do this outside lli_och_mutex lock to prevent deadlock where
667 different kind of OPEN lock for this same inode gets cancelled
668 by ldlm_cancel_lru */
669 if (!S_ISREG(inode->i_mode))
670 GOTO(out_och_free, rc);
/* no stripe metadata yet: object creation may be deferred */
674 if (!lli->lli_has_smd) {
675 if (file->f_flags & O_LOV_DELAY_CREATE ||
676 !(file->f_mode & FMODE_WRITE)) {
677 CDEBUG(D_INODE, "object creation was delayed\n");
678 GOTO(out_och_free, rc);
681 file->f_flags &= ~O_LOV_DELAY_CREATE;
682 GOTO(out_och_free, rc);
/* error-path cleanup: free a handle we allocated but never published */
686 if (och_p && *och_p) {
687 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
688 *och_p = NULL; /* OBD_FREE writes some magic there */
691 mutex_unlock(&lli->lli_och_mutex);
694 if (opendir_set != 0)
695 ll_stop_statahead(inode, lli->lli_opendir_key);
697 ll_file_data_put(fd);
699 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
702 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
703 ptlrpc_req_finished(it->d.lustre.it_data);
704 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for a lease lock: on a blocking callback, asynchronously
 * cancel the lease lock (which breaks the lease).
 */
710 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
711 struct ldlm_lock_desc *desc, void *data, int flag)
714 struct lustre_handle lockh;
718 case LDLM_CB_BLOCKING:
719 ldlm_lock2handle(lock, &lockh);
720 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
722 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
726 case LDLM_CB_CANCELING:
734 * Acquire a lease and open the file.
736 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
739 struct lookup_intent it = { .it_op = IT_OPEN };
740 struct ll_sb_info *sbi = ll_i2sbi(inode);
741 struct md_op_data *op_data;
742 struct ptlrpc_request *req;
743 struct lustre_handle old_handle = { 0 };
744 struct obd_client_handle *och = NULL;
/* only plain read or plain write leases are supported */
749 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
750 RETURN(ERR_PTR(-EINVAL));
753 struct ll_inode_info *lli = ll_i2info(inode);
754 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
755 struct obd_client_handle **och_p;
/* the fd must have been opened in a compatible, non-exec mode */
758 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
759 RETURN(ERR_PTR(-EPERM));
761 /* Get the openhandle of the file */
763 mutex_lock(&lli->lli_och_mutex);
/* only one lease per fd */
764 if (fd->fd_lease_och != NULL) {
765 mutex_unlock(&lli->lli_och_mutex);
769 if (fd->fd_och == NULL) {
770 if (file->f_mode & FMODE_WRITE) {
771 LASSERT(lli->lli_mds_write_och != NULL);
772 och_p = &lli->lli_mds_write_och;
773 och_usecount = &lli->lli_open_fd_write_count;
775 LASSERT(lli->lli_mds_read_och != NULL);
776 och_p = &lli->lli_mds_read_och;
777 och_usecount = &lli->lli_open_fd_read_count;
779 if (*och_usecount == 1) {
786 mutex_unlock(&lli->lli_och_mutex);
787 if (rc < 0) /* more than 1 opener */
790 LASSERT(fd->fd_och != NULL);
791 old_handle = fd->fd_och->och_fh;
796 RETURN(ERR_PTR(-ENOMEM));
798 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
799 LUSTRE_OPC_ANY, NULL);
801 GOTO(out, rc = PTR_ERR(op_data));
803 /* To tell the MDT this openhandle is from the same owner */
804 op_data->op_handle = old_handle;
806 it.it_flags = fmode | MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
807 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
808 ll_md_blocking_lease_ast,
809 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
810 * it can be cancelled which may mislead applications that the lease is
812 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
813 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
814 * doesn't deal with openhandle, so normal openhandle will be leaked. */
815 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
816 ll_finish_md_op_data(op_data);
818 ptlrpc_req_finished(req);
819 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
822 GOTO(out_release_it, rc);
824 if (it_disposition(&it, DISP_LOOKUP_NEG))
825 GOTO(out_release_it, rc = -ENOENT);
827 rc = it_open_error(DISP_OPEN_OPEN, &it);
829 GOTO(out_release_it, rc);
831 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
832 ll_och_fill(sbi->ll_md_exp, &it, och);
834 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
835 GOTO(out_close, rc = -EOPNOTSUPP);
837 /* already get lease, handle lease lock */
838 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
839 if (it.d.lustre.it_lock_mode == 0 ||
840 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
841 /* open lock must return for lease */
842 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
843 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
844 it.d.lustre.it_lock_bits);
845 GOTO(out_close, rc = -EPROTO);
848 ll_intent_release(&it);
/* error path: close the openhandle we obtained above */
852 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och);
854 CERROR("Close openhandle returned %d\n", rc2);
856 /* cancel open lock */
857 if (it.d.lustre.it_lock_mode != 0) {
858 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
859 it.d.lustre.it_lock_mode);
860 it.d.lustre.it_lock_mode = 0;
863 ll_intent_release(&it);
868 EXPORT_SYMBOL(ll_lease_open);
871 * Release lease and close the file.
872 * It will check if the lease has ever broken.
874 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
877 struct ldlm_lock *lock;
/* default to "broken" when the lock can no longer be found */
878 bool cancelled = true;
882 lock = ldlm_handle2lock(&och->och_lease_handle);
884 lock_res_and_lock(lock);
885 cancelled = ldlm_is_cancel(lock);
886 unlock_res_and_lock(lock);
890 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
891 PFID(&ll_i2info(inode)->lli_fid), cancelled);
894 ldlm_cli_cancel(&och->och_lease_handle, 0);
895 if (lease_broken != NULL)
896 *lease_broken = cancelled;
898 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och);
901 EXPORT_SYMBOL(ll_lease_close);
903 /* Fills the obdo with the attributes for the lsm */
904 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
905 struct obd_capa *capa, struct obdo *obdo,
906 __u64 ioepoch, int sync)
908 struct ptlrpc_request_set *set;
909 struct obd_info oinfo = { { { 0 } } };
914 LASSERT(lsm != NULL);
918 oinfo.oi_oa->o_oi = lsm->lsm_oi;
919 oinfo.oi_oa->o_mode = S_IFREG;
920 oinfo.oi_oa->o_ioepoch = ioepoch;
/* request the full set of attributes we care about from the OSTs */
921 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
922 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
923 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
924 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
925 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
926 OBD_MD_FLDATAVERSION;
927 oinfo.oi_capa = capa;
/* @sync requests the getattr under a server-side lock (SRVLOCK) */
929 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
930 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
933 set = ptlrpc_prep_set();
935 CERROR("can't allocate ptlrpc set\n");
938 rc = obd_getattr_async(exp, &oinfo, set);
940 rc = ptlrpc_set_wait(set);
941 ptlrpc_set_destroy(set);
/* keep only the attribute bits the OSTs are authoritative for */
944 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
945 OBD_MD_FLATIME | OBD_MD_FLMTIME |
946 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
947 OBD_MD_FLDATAVERSION);
952 * Performs the getattr on the inode and updates its fields.
953 * If @sync != 0, perform the getattr under the server-side lock.
955 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
956 __u64 ioepoch, int sync)
958 struct obd_capa *capa = ll_mdscapa_get(inode);
959 struct lov_stripe_md *lsm;
963 lsm = ccc_inode_lsm_get(inode);
964 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
965 capa, obdo, ioepoch, sync);
968 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* fold the OST-side attributes back into the VFS inode */
970 obdo_refresh_inode(inode, obdo, obdo->o_valid);
971 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
972 " blksize %lu\n", POSTID(oi), i_size_read(inode),
973 (unsigned long long)inode->i_blocks,
974 (unsigned long)ll_inode_blksize(inode));
976 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps with OST attributes (via the cl_object)
 * into the inode, taking the newest of each timestamp, and update the
 * cached size/blocks under the inode size lock.
 */
980 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
982 struct ll_inode_info *lli = ll_i2info(inode);
983 struct cl_object *obj = lli->lli_clob;
984 struct cl_attr *attr = ccc_env_thread_attr(env);
990 ll_inode_size_lock(inode);
991 /* merge timestamps the most recently obtained from mds with
992 timestamps obtained from osts */
993 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
994 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
995 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
996 inode_init_lvb(inode, &lvb);
998 cl_object_attr_lock(obj);
999 rc = cl_object_attr_get(env, obj, attr);
1000 cl_object_attr_unlock(obj);
/* keep whichever side has the more recent timestamp */
1003 if (lvb.lvb_atime < attr->cat_atime)
1004 lvb.lvb_atime = attr->cat_atime;
1005 if (lvb.lvb_ctime < attr->cat_ctime)
1006 lvb.lvb_ctime = attr->cat_ctime;
1007 if (lvb.lvb_mtime < attr->cat_mtime)
1008 lvb.lvb_mtime = attr->cat_mtime;
1010 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1011 PFID(&lli->lli_fid), attr->cat_size);
1012 cl_isize_write_nolock(inode, attr->cat_size);
1014 inode->i_blocks = attr->cat_blocks;
1016 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1017 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1018 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1020 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctl paths: fetch current OST attributes for @lsm
 * and copy size/blocks/times into the user-visible stat structure @st.
 */
1025 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1028 struct obdo obdo = { 0 };
1031 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1033 st->st_size = obdo.o_size;
1034 st->st_blocks = obdo.o_blocks;
1035 st->st_mtime = obdo.o_mtime;
1036 st->st_atime = obdo.o_atime;
1037 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for @file: translate the file's open flags
 * (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into cl_io read/write parameters
 * and choose the lock requirement mode.
 */
1042 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1044 struct inode *inode = file->f_dentry->d_inode;
1046 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1048 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1049 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1050 file->f_flags & O_DIRECT ||
1053 io->ci_obj = ll_i2info(inode)->lli_clob;
1054 io->ci_lockreq = CILR_MAYBE;
/* nolock files skip both client and server locking */
1055 if (ll_file_nolock(file)) {
1056 io->ci_lockreq = CILR_NEVER;
1057 io->ci_no_srvlock = 1;
1058 } else if (file->f_flags & O_APPEND) {
/* appends need a mandatory lock to serialize end-of-file writes */
1059 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (normal, sendfile, splice).
 * Sets up a cl_io of type @iot at *@ppos for @count bytes, runs the cl_io
 * loop, advances *@ppos, tallies stats and tracks write failures on the fd.
 */
1064 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1065 struct file *file, enum cl_io_type iot,
1066 loff_t *ppos, size_t count)
1068 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1069 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1075 io = ccc_env_thread_io(env);
1076 ll_io_init(io, file, iot == CIT_WRITE);
1078 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1079 struct vvp_io *vio = vvp_env_io(env);
1080 struct ccc_io *cio = ccc_env_io(env);
1081 int write_mutex_locked = 0;
1083 cio->cui_fd = LUSTRE_FPRIVATE(file);
1084 vio->cui_io_subtype = args->via_io_subtype;
/* copy the per-subtype arguments into the IO environment */
1086 switch (vio->cui_io_subtype) {
1088 cio->cui_iov = args->u.normal.via_iov;
1089 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1090 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1091 cio->cui_iocb = args->u.normal.via_iocb;
/* non-group-lock writes serialize on lli_write_mutex; reads only take
 * the truncate semaphore shared */
1092 if ((iot == CIT_WRITE) &&
1093 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1094 if (mutex_lock_interruptible(&lli->
1096 GOTO(out, result = -ERESTARTSYS);
1097 write_mutex_locked = 1;
1098 } else if (iot == CIT_READ) {
1099 down_read(&lli->lli_trunc_sem);
1103 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1104 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1107 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1108 vio->u.splice.cui_flags = args->u.splice.via_flags;
1111 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1114 result = cl_io_loop(env, io);
1115 if (write_mutex_locked)
1116 mutex_unlock(&lli->lli_write_mutex);
1117 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1118 up_read(&lli->lli_trunc_sem);
1120 /* cl_io_rw_init() handled IO */
1121 result = io->ci_result;
/* ci_nob is the number of bytes actually transferred */
1124 if (io->ci_nob > 0) {
1125 result = io->ci_nob;
1126 *ppos = io->u.ci_wr.wr.crw_pos;
1130 cl_io_fini(env, io);
1131 /* If any bit been read/written (result != 0), we just return
1132 * short read/write instead of restart io. */
1133 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1134 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1135 iot == CIT_READ ? "read" : "write",
1136 file->f_dentry->d_name.name, *ppos, count);
1137 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1141 if (iot == CIT_READ) {
1143 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1144 LPROC_LL_READ_BYTES, result);
1145 } else if (iot == CIT_WRITE) {
1147 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1148 LPROC_LL_WRITE_BYTES, result);
1149 fd->fd_write_failed = false;
/* interrupted syscalls do not count as a write failure */
1150 } else if (result != -ERESTARTSYS) {
1151 fd->fd_write_failed = true;
1160 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1162 static int ll_file_get_iov_count(const struct iovec *iov,
1163 unsigned long *nr_segs, size_t *count)
/* validate the iovec: sum segment lengths into *count, checking each
 * segment is readable and the total never wraps negative */
1168 for (seg = 0; seg < *nr_segs; seg++) {
1169 const struct iovec *iv = &iov[seg];
1172 * If any segment has a negative length, or the cumulative
1173 * length ever wraps negative then return -EINVAL.
1176 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1178 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1183 cnt -= iv->iov_len; /* This segment is no good */
/*
 * .aio_read entry point: validate the iovec, package it into vvp_io_args
 * and hand off to ll_file_io_generic() as a CIT_READ.
 */
1190 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1191 unsigned long nr_segs, loff_t pos)
1194 struct vvp_io_args *args;
1200 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1204 env = cl_env_get(&refcheck);
1206 RETURN(PTR_ERR(env));
1208 args = vvp_env_args(env, IO_NORMAL);
/* cast away const: the args struct stores a mutable pointer */
1209 args->u.normal.via_iov = (struct iovec *)iov;
1210 args->u.normal.via_nrsegs = nr_segs;
1211 args->u.normal.via_iocb = iocb;
1213 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1214 &iocb->ki_pos, count);
1215 cl_env_put(env, &refcheck);
/*
 * Synchronous .read entry point: wrap @buf/@count in a single-segment iovec
 * and a sync kiocb, then delegate to ll_file_aio_read().
 */
1219 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1223 struct iovec *local_iov;
1224 struct kiocb *kiocb;
1229 env = cl_env_get(&refcheck);
1231 RETURN(PTR_ERR(env));
/* per-thread scratch iovec/kiocb live in the cl environment */
1233 local_iov = &vvp_env_info(env)->vti_local_iov;
1234 kiocb = &vvp_env_info(env)->vti_kiocb;
1235 local_iov->iov_base = (void __user *)buf;
1236 local_iov->iov_len = count;
1237 init_sync_kiocb(kiocb, file);
1238 kiocb->ki_pos = *ppos;
1239 kiocb->ki_left = count;
1241 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1242 *ppos = kiocb->ki_pos;
1244 cl_env_put(env, &refcheck);
1249 * Write to a file (through the page cache).
/*
 * AIO write entry point: mirrors ll_file_aio_read() but dispatches the
 * generic client I/O path as CIT_WRITE at iocb->ki_pos.
 */
1252 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1253 unsigned long nr_segs, loff_t pos)
1256 struct vvp_io_args *args;
1262 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1266 env = cl_env_get(&refcheck);
1268 RETURN(PTR_ERR(env));
1270 args = vvp_env_args(env, IO_NORMAL);
1271 args->u.normal.via_iov = (struct iovec *)iov;
1272 args->u.normal.via_nrsegs = nr_segs;
1273 args->u.normal.via_iocb = iocb;
1275 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1276 &iocb->ki_pos, count);
1277 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry: single-segment iovec + sync kiocb in
 * env-local storage, delegated to ll_file_aio_write(); the file
 * position is written back to *ppos afterwards.
 */
1281 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1285 struct iovec *local_iov;
1286 struct kiocb *kiocb;
1291 env = cl_env_get(&refcheck);
1293 RETURN(PTR_ERR(env));
1295 local_iov = &vvp_env_info(env)->vti_local_iov;
1296 kiocb = &vvp_env_info(env)->vti_kiocb;
1297 local_iov->iov_base = (void __user *)buf;
1298 local_iov->iov_len = count;
1299 init_sync_kiocb(kiocb, file);
1300 kiocb->ki_pos = *ppos;
1301 kiocb->ki_left = count;
1303 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1304 *ppos = kiocb->ki_pos;
1306 cl_env_put(env, &refcheck);
1311 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read handler: packs pipe/flags into IO_SPLICE vvp args and
 * runs the generic client I/O path as CIT_READ at *ppos.
 */
1313 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1314 struct pipe_inode_info *pipe, size_t count,
1318 struct vvp_io_args *args;
1323 env = cl_env_get(&refcheck);
1325 RETURN(PTR_ERR(env));
1327 args = vvp_env_args(env, IO_SPLICE);
1328 args->u.splice.via_pipe = pipe;
1329 args->u.splice.via_flags = flags;
1331 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1332 cl_env_put(env, &refcheck);
/*
 * Re-create the OST objects backing an inode: duplicate the current
 * lov_stripe_md, tag the obdo with OBD_FL_RECREATE_OBJS plus the
 * target OST index (stashed in o_nlink), and issue obd_create() to the
 * data export under the inode size lock.  Fails with -ENOENT when the
 * inode has no striped objects.
 * NOTE(review): oa allocation/validation lines are elided from this
 * chunk; confirm oa is allocated before the o_nlink assignment.
 */
1336 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1339 struct obd_export *exp = ll_i2dtexp(inode);
1340 struct obd_trans_info oti = { 0 };
1341 struct obdo *oa = NULL;
1344 struct lov_stripe_md *lsm = NULL, *lsm2;
1351 lsm = ccc_inode_lsm_get(inode);
1352 if (!lsm_has_objects(lsm))
1353 GOTO(out, rc = -ENOENT);
1355 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1356 (lsm->lsm_stripe_count));
1358 OBD_ALLOC_LARGE(lsm2, lsm_size);
1360 GOTO(out, rc = -ENOMEM);
1363 oa->o_nlink = ost_idx;
1364 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1365 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1366 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1367 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1368 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1369 memcpy(lsm2, lsm, lsm_size);
1370 ll_inode_size_lock(inode);
1371 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1372 ll_inode_size_unlock(inode);
1374 OBD_FREE_LARGE(lsm2, lsm_size);
1377 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: admin-only (CFS_CAP_SYS_ADMIN).  Copies
 * a struct ll_recreate_obj from user space, builds an MDT0-sequence
 * ost_id from lrc_id, and delegates to ll_lov_recreate() for the
 * requested OST index.
 */
1382 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1384 struct ll_recreate_obj ucreat;
1388 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1391 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1395 ostid_set_seq_mdt0(&oi);
1396 ostid_set_id(&oi, ucreat.lrc_id);
1397 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: admin-only.  Copies a lu_fid from user
 * space, converts it to an ost_id, and derives the OST index from bits
 * 16..31 of the FID sequence before calling ll_lov_recreate().
 */
1400 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1407 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1410 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1413 fid_to_ostid(&fid, &oi);
1414 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1415 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply user-supplied striping (lov_user_md) to an inode by opening it
 * with an IT_OPEN intent carrying the EA.  If a layout already exists
 * the request is rejected (stripe can only be set once).  The intent's
 * open handle is released immediately; on intent failure the enqueue
 * request is finished via out_req_free.
 */
1418 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1419 int flags, struct lov_user_md *lum, int lum_size)
1421 struct lov_stripe_md *lsm = NULL;
1422 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1426 lsm = ccc_inode_lsm_get(inode);
1428 ccc_inode_lsm_put(inode, lsm);
1429 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1434 ll_inode_size_lock(inode);
1435 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1438 rc = oit.d.lustre.it_status;
1440 GOTO(out_req_free, rc);
1442 ll_release_openhandle(file->f_dentry, &oit);
1445 ll_inode_size_unlock(inode);
1446 ll_intent_release(&oit);
1447 ccc_inode_lsm_put(inode, lsm);
1450 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (layout) of a named child via md_getattr_name() and
 * return the raw lov_mds_md plus its size to the caller; the request
 * is returned too so the caller controls the buffer lifetime.  Only
 * LOV_MAGIC_V1/V3 layouts are accepted (-EPROTO otherwise); on a
 * little-endian-mismatched host the EA is byte-swapped to host order,
 * skipping the per-object swab for directories (default striping has
 * no real objects to swab).
 */
1454 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1455 struct lov_mds_md **lmmp, int *lmm_size,
1456 struct ptlrpc_request **request)
1458 struct ll_sb_info *sbi = ll_i2sbi(inode);
1459 struct mdt_body *body;
1460 struct lov_mds_md *lmm = NULL;
1461 struct ptlrpc_request *req = NULL;
1462 struct md_op_data *op_data;
1465 rc = ll_get_max_mdsize(sbi, &lmmsize);
1469 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1470 strlen(filename), lmmsize,
1471 LUSTRE_OPC_ANY, NULL);
1472 if (IS_ERR(op_data))
1473 RETURN(PTR_ERR(op_data));
1475 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1476 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1477 ll_finish_md_op_data(op_data);
1479 CDEBUG(D_INFO, "md_getattr_name failed "
1480 "on %s: rc %d\n", filename, rc);
1484 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1485 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1487 lmmsize = body->eadatasize;
1489 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1491 GOTO(out, rc = -ENODATA);
1494 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1495 LASSERT(lmm != NULL);
1497 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1498 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1499 GOTO(out, rc = -EPROTO);
1503 * This is coming from the MDS, so is probably in
1504 * little endian. We convert it to host endian before
1505 * passing it to userspace.
1507 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1510 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1511 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1514 /* if function called for directory - we should
1515 * avoid swab not existent lsm objects */
1516 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1517 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1518 if (S_ISREG(body->mode))
1519 lustre_swab_lov_user_md_objects(
1520 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1522 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1523 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1524 if (S_ISREG(body->mode))
1525 lustre_swab_lov_user_md_objects(
1526 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1533 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only.  Copies a lov_user_md (with one
 * trailing lov_user_ost_data) from user space into a large kernel
 * buffer and applies it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS | FMODE_WRITE; the buffer is freed on every path.
 */
1538 static int ll_lov_setea(struct inode *inode, struct file *file,
1541 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1542 struct lov_user_md *lump;
1543 int lum_size = sizeof(struct lov_user_md) +
1544 sizeof(struct lov_user_ost_data);
1548 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1551 OBD_ALLOC_LARGE(lump, lum_size);
1555 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1556 OBD_FREE_LARGE(lump, lum_size);
1560 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1562 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's lov_user_md, first as
 * the smaller v1 and, if the magic says v3, re-copy the full v3 struct.
 * On success the in-memory stripe count visible to the user is reset,
 * the layout is refreshed, and the resulting stripe info is echoed
 * back through the LL_IOC_LOV_GETSTRIPE obd_iocontrol path.
 */
1566 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1569 struct lov_user_md_v3 lumv3;
1570 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1571 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1572 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1574 int flags = FMODE_WRITE;
1577 /* first try with v1 which is smaller than v3 */
1578 lum_size = sizeof(struct lov_user_md_v1);
1579 if (copy_from_user(lumv1, lumv1p, lum_size))
1582 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1583 lum_size = sizeof(struct lov_user_md_v3);
1584 if (copy_from_user(&lumv3, lumv3p, lum_size))
1588 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1590 struct lov_stripe_md *lsm;
1593 put_user(0, &lumv1p->lmm_stripe_count);
1595 ll_layout_refresh(inode, &gen);
1596 lsm = ccc_inode_lsm_get(inode);
1597 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1598 0, lsm, (void *)arg);
1599 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: grab the inode's current stripe md and
 * let the data export's obd_iocontrol() copy the layout out to the
 * user buffer at arg.
 */
1604 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1606 struct lov_stripe_md *lsm;
1610 lsm = ccc_inode_lsm_get(inode);
1612 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1614 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group lock (gid == arg) on the file's cl object.  Refused with
 * -EOPNOTSUPP when locking is disabled for this file.  The fast path
 * checks under lli_lock that this fd does not already hold a group
 * lock; cl_get_grouplock() is then called outside the spinlock (it can
 * block), and a second locked check handles the race where another
 * thread won in between — the freshly acquired lock is dropped in that
 * case.  On success the grouplock and LL_FILE_GROUP_LOCKED flag are
 * recorded in the per-fd state.
 */
1618 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1620 struct ll_inode_info *lli = ll_i2info(inode);
1621 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1622 struct ccc_grouplock grouplock;
1626 if (ll_file_nolock(file))
1627 RETURN(-EOPNOTSUPP);
1629 spin_lock(&lli->lli_lock);
1630 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1631 CWARN("group lock already existed with gid %lu\n",
1632 fd->fd_grouplock.cg_gid);
1633 spin_unlock(&lli->lli_lock);
1636 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1637 spin_unlock(&lli->lli_lock);
1639 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1640 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1644 spin_lock(&lli->lli_lock);
1645 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1646 spin_unlock(&lli->lli_lock);
1647 CERROR("another thread just won the race\n");
1648 cl_put_grouplock(&grouplock);
1652 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1653 fd->fd_grouplock = grouplock;
1654 spin_unlock(&lli->lli_lock);
1656 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release a group lock previously taken with ll_get_grouplock().  Under
 * lli_lock: verify this fd actually holds a group lock and that its gid
 * matches arg; then detach the grouplock from the fd state and clear
 * LL_FILE_GROUP_LOCKED before calling cl_put_grouplock() outside the
 * spinlock (a local copy is taken so the release can happen unlocked).
 */
1660 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1662 struct ll_inode_info *lli = ll_i2info(inode);
1663 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1664 struct ccc_grouplock grouplock;
1667 spin_lock(&lli->lli_lock);
1668 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1669 spin_unlock(&lli->lli_lock);
1670 CWARN("no group lock held\n");
1673 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1675 if (fd->fd_grouplock.cg_gid != arg) {
1676 CWARN("group lock %lu doesn't match current id %lu\n",
1677 arg, fd->fd_grouplock.cg_gid);
1678 spin_unlock(&lli->lli_lock);
1682 grouplock = fd->fd_grouplock;
1683 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1684 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1685 spin_unlock(&lli->lli_lock);
1687 cl_put_grouplock(&grouplock);
1688 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1693 * Close inode open handle
1695 * \param dentry [in] dentry which contains the inode
1696 * \param it [in,out] intent which contains open info and result
1699 * \retval <0 failure
/*
 * Close the MDS open handle carried by a lookup intent: no-op for the
 * filesystem root or when the intent carries no DISP_OPEN_OPEN
 * disposition.  Otherwise an obd_client_handle is allocated, filled
 * from the intent, and closed via ll_close_inode_openhandle(); any
 * DISP_ENQ_OPEN_REF request reference held by the intent (normally
 * consumed by ll_file_open) is released here instead.
 */
1701 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1703 struct inode *inode = dentry->d_inode;
1704 struct obd_client_handle *och;
1710 /* Root ? Do nothing. */
1711 if (dentry->d_inode->i_sb->s_root == dentry)
1714 /* No open handle to close? Move away */
1715 if (!it_disposition(it, DISP_OPEN_OPEN))
1718 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1720 OBD_ALLOC(och, sizeof(*och));
1722 GOTO(out, rc = -ENOMEM);
1724 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1726 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1729 /* this one is in place of ll_file_open */
1730 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1731 ptlrpc_req_finished(it->d.lustre.it_data);
1732 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1738 * Get size for inode for which FIEMAP mapping is requested.
1739 * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP worker: rejects unsupported flags (echoing the
 * incompatible set back to the caller), honors FIEMAP_FLAG_SYNC by
 * flushing dirty pages first, requires FIEMAP_FLAG_DEVICE_ORDER for
 * multi-stripe files, and otherwise builds an ll_fiemap_info_key
 * (KEY_FIEMAP) from the inode's obdo + the user's fiemap request and
 * asks the data export via obd_get_info().  A zero-size file short-
 * circuits with zero mapped extents.
 */
1741 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1744 struct obd_export *exp = ll_i2dtexp(inode);
1745 struct lov_stripe_md *lsm = NULL;
1746 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1747 int vallen = num_bytes;
1751 /* Checks for fiemap flags */
1752 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1753 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1757 /* Check for FIEMAP_FLAG_SYNC */
1758 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1759 rc = filemap_fdatawrite(inode->i_mapping);
1764 lsm = ccc_inode_lsm_get(inode);
1768 /* If the stripe_count > 1 and the application does not understand
1769 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1771 if (lsm->lsm_stripe_count > 1 &&
1772 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1773 GOTO(out, rc = -EOPNOTSUPP);
1775 fm_key.oa.o_oi = lsm->lsm_oi;
1776 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1778 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1779 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1780 /* If filesize is 0, then there would be no objects for mapping */
1781 if (fm_key.oa.o_size == 0) {
1782 fiemap->fm_mapped_extents = 0;
1786 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1788 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1791 CERROR("obd_get_info failed: rc = %d\n", rc);
1794 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: translate a FID to a path via the MDC.
 * Permitted for CAP_DAC_READ_SEARCH holders or when the mount allows
 * user fid2path.  A small getinfo_fid2path header is first copied in
 * to learn gf_pathlen, then a buffer with room for the path is
 * allocated, passed through obd_iocontrol(OBD_IOC_FID2PATH), and the
 * filled result is copied back to user space.
 */
1798 int ll_fid2path(struct inode *inode, void *arg)
1800 struct obd_export *exp = ll_i2mdexp(inode);
1801 struct getinfo_fid2path *gfout, *gfin;
1805 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1806 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1809 /* Need to get the buflen */
1810 OBD_ALLOC_PTR(gfin);
1813 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1818 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1819 OBD_ALLOC(gfout, outsize);
1820 if (gfout == NULL) {
1824 memcpy(gfout, gfin, sizeof(*gfout));
1827 /* Call mdc_iocontrol */
1828 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1832 if (copy_to_user(arg, gfout, outsize))
1836 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: read fm_extent_count from the user's
 * ll_user_fiemap to size a kernel buffer (header + extent array), copy
 * the request in (plus the first extent when present, which seeds
 * end_offset/device continuation), run ll_do_fiemap(), and copy the
 * header plus fm_mapped_extents extents back out.
 * NOTE(review): extent_count * sizeof(extent) is computed from a
 * user-supplied count — the overflow/limit check, if any, is in elided
 * lines; verify against the full source.
 */
1840 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1842 struct ll_user_fiemap *fiemap_s;
1843 size_t num_bytes, ret_bytes;
1844 unsigned int extent_count;
1847 /* Get the extent count so we can calculate the size of
1848 * required fiemap buffer */
1849 if (get_user(extent_count,
1850 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1852 num_bytes = sizeof(*fiemap_s) + (extent_count *
1853 sizeof(struct ll_fiemap_extent));
1855 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1856 if (fiemap_s == NULL)
1859 /* get the fiemap value */
1860 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1862 GOTO(error, rc = -EFAULT);
1864 /* If fm_extent_count is non-zero, read the first extent since
1865 * it is used to calculate end_offset and device from previous
1868 if (copy_from_user(&fiemap_s->fm_extents[0],
1869 (char __user *)arg + sizeof(*fiemap_s),
1870 sizeof(struct ll_fiemap_extent)))
1871 GOTO(error, rc = -EFAULT);
1874 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1878 ret_bytes = sizeof(struct ll_user_fiemap);
1880 if (extent_count != 0)
1881 ret_bytes += (fiemap_s->fm_mapped_extents *
1882 sizeof(struct ll_fiemap_extent));
1884 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1888 OBD_FREE_LARGE(fiemap_s, num_bytes);
1893 * Read the data_version for inode.
1895 * This value is computed using stripe object version on OST.
1896 * Version is computed using server side locking.
1898 * @param extent_lock Take extent lock. Not needed if a process is already
1899 * holding the OST object group locks.
/*
 * A stripeless inode is treated as version 0.  Otherwise an obdo is
 * allocated and ll_lsm_getattr() queries the OSTs; the version is only
 * reported when OBD_MD_FLDATAVERSION came back valid.
 */
1901 int ll_data_version(struct inode *inode, __u64 *data_version,
1904 struct lov_stripe_md *lsm = NULL;
1905 struct ll_sb_info *sbi = ll_i2sbi(inode);
1906 struct obdo *obdo = NULL;
1910 /* If no stripe, we consider version is 0. */
1911 lsm = ccc_inode_lsm_get(inode);
1912 if (!lsm_has_objects(lsm)) {
1914 CDEBUG(D_INODE, "No object for inode\n");
1918 OBD_ALLOC_PTR(obdo);
1920 GOTO(out, rc = -ENOMEM);
1922 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1924 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1927 *data_version = obdo->o_data_version;
1933 ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved mtime/atime (restored after the swap when requested), and the
 * per-inode data-version check flags.  Bools are used instead of bits
 * so the pairs can be swap()ed alongside the inodes.
 */
1937 struct ll_swap_stack {
1938 struct iattr ia1, ia2;
1940 struct inode *inode1, *inode2;
1941 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS worker: atomically exchange the layouts of
 * two regular files on the same filesystem.  Steps:
 *  - validate: both writable, same superblock, target is a regular file;
 *  - order the pair by FID (swapping the per-file dv/check state with
 *    them) so the MDT always sees a canonical ordering;
 *  - optionally take group locks on both files (gid != 0) to flush
 *    dirty cache, releasing in reverse order on any failure;
 *  - save mtime/atime when SWAP_LAYOUTS_KEEP_* is requested;
 *  - re-check each file's data version right before the swap and bail
 *    with -EAGAIN if it moved;
 *  - send the swap through obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS) with
 *    a mdc_swap_layouts piggybacked on md_op_data;
 *  - drop group locks, then restore the saved timestamps (note ia2 is
 *    applied to inode1 and ia1 to inode2 — times follow the layouts).
 */
1944 static int ll_swap_layouts(struct file *file1, struct file *file2,
1945 struct lustre_swap_layouts *lsl)
1947 struct mdc_swap_layouts msl;
1948 struct md_op_data *op_data;
1951 struct ll_swap_stack *llss = NULL;
1954 OBD_ALLOC_PTR(llss);
1958 llss->inode1 = file1->f_dentry->d_inode;
1959 llss->inode2 = file2->f_dentry->d_inode;
1961 if (!S_ISREG(llss->inode2->i_mode))
1962 GOTO(free, rc = -EINVAL);
1964 if (inode_permission(llss->inode1, MAY_WRITE) ||
1965 inode_permission(llss->inode2, MAY_WRITE))
1966 GOTO(free, rc = -EPERM);
1968 if (llss->inode2->i_sb != llss->inode1->i_sb)
1969 GOTO(free, rc = -EXDEV);
1971 /* we use 2 bool because it is easier to swap than 2 bits */
1972 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1973 llss->check_dv1 = true;
1975 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1976 llss->check_dv2 = true;
1978 /* we cannot use lsl->sl_dvX directly because we may swap them */
1979 llss->dv1 = lsl->sl_dv1;
1980 llss->dv2 = lsl->sl_dv2;
1982 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1983 if (rc == 0) /* same file, done! */
1986 if (rc < 0) { /* sequentialize it */
1987 swap(llss->inode1, llss->inode2);
1989 swap(llss->dv1, llss->dv2);
1990 swap(llss->check_dv1, llss->check_dv2);
1994 if (gid != 0) { /* application asks to flush dirty cache */
1995 rc = ll_get_grouplock(llss->inode1, file1, gid);
1999 rc = ll_get_grouplock(llss->inode2, file2, gid);
2001 ll_put_grouplock(llss->inode1, file1, gid);
2006 /* to be able to restore mtime and atime after swap
2007 * we need to first save them */
2009 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2010 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2011 llss->ia1.ia_atime = llss->inode1->i_atime;
2012 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2013 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2014 llss->ia2.ia_atime = llss->inode2->i_atime;
2015 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2018 /* ultimate check, before swaping the layouts we check if
2019 * dataversion has changed (if requested) */
2020 if (llss->check_dv1) {
2021 rc = ll_data_version(llss->inode1, &dv, 0);
2024 if (dv != llss->dv1)
2025 GOTO(putgl, rc = -EAGAIN);
2028 if (llss->check_dv2) {
2029 rc = ll_data_version(llss->inode2, &dv, 0);
2032 if (dv != llss->dv2)
2033 GOTO(putgl, rc = -EAGAIN);
2036 /* struct md_op_data is used to send the swap args to the mdt
2037 * only flags is missing, so we use struct mdc_swap_layouts
2038 * through the md_op_data->op_data */
2039 /* flags from user space have to be converted before they are send to
2040 * server, no flag is sent today, they are only used on the client */
2043 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2044 0, LUSTRE_OPC_ANY, &msl);
2045 if (IS_ERR(op_data))
2046 GOTO(free, rc = PTR_ERR(op_data));
2048 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2049 sizeof(*op_data), op_data, NULL);
2050 ll_finish_md_op_data(op_data);
2054 ll_put_grouplock(llss->inode2, file2, gid);
2055 ll_put_grouplock(llss->inode1, file1, gid);
2058 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2062 /* clear useless flags */
2063 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2064 llss->ia1.ia_valid &= ~ATTR_MTIME;
2065 llss->ia2.ia_valid &= ~ATTR_MTIME;
2068 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2069 llss->ia1.ia_valid &= ~ATTR_ATIME;
2070 llss->ia2.ia_valid &= ~ATTR_ATIME;
2073 /* update time if requested */
2075 if (llss->ia2.ia_valid != 0) {
2076 mutex_lock(&llss->inode1->i_mutex);
2077 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2078 mutex_unlock(&llss->inode1->i_mutex);
2081 if (llss->ia1.ia_valid != 0) {
2084 mutex_lock(&llss->inode2->i_mutex);
2085 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2086 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main unlocked_ioctl entry for regular files.  Dispatches on cmd:
 * per-fd flag get/set, LOV stripe/EA manipulation, layout swap, object
 * recreation, FIEMAP, group locks, statfs, FID/path translation, data
 * version, MDT index, HSM state/action, and file leases; anything
 * unrecognized falls through to registered ioctl handlers and finally
 * to the data export's obd_iocontrol().  tty ioctls (type 'T'/'t') are
 * explicitly not handled here.
 */
2098 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2100 struct inode *inode = file->f_dentry->d_inode;
2101 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2105 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2106 inode->i_generation, inode, cmd);
2107 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2109 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2110 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2114 case LL_IOC_GETFLAGS:
2115 /* Get the current value of the file flags */
2116 return put_user(fd->fd_flags, (int *)arg);
2117 case LL_IOC_SETFLAGS:
2118 case LL_IOC_CLRFLAGS:
2119 /* Set or clear specific file flags */
2120 /* XXX This probably needs checks to ensure the flags are
2121 * not abused, and to handle any flag side effects.
2123 if (get_user(flags, (int *) arg))
2126 if (cmd == LL_IOC_SETFLAGS) {
2127 if ((flags & LL_FILE_IGNORE_LOCK) &&
2128 !(file->f_flags & O_DIRECT)) {
2129 CERROR("%s: unable to disable locking on "
2130 "non-O_DIRECT file\n", current->comm);
2134 fd->fd_flags |= flags;
2136 fd->fd_flags &= ~flags;
2139 case LL_IOC_LOV_SETSTRIPE:
2140 RETURN(ll_lov_setstripe(inode, file, arg));
2141 case LL_IOC_LOV_SETEA:
2142 RETURN(ll_lov_setea(inode, file, arg));
2143 case LL_IOC_LOV_SWAP_LAYOUTS: {
2145 struct lustre_swap_layouts lsl;
2147 if (copy_from_user(&lsl, (char *)arg,
2148 sizeof(struct lustre_swap_layouts)))
2151 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2154 file2 = fget(lsl.sl_fd);
2159 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2160 rc = ll_swap_layouts(file, file2, &lsl);
2164 case LL_IOC_LOV_GETSTRIPE:
2165 RETURN(ll_lov_getstripe(inode, arg));
2166 case LL_IOC_RECREATE_OBJ:
2167 RETURN(ll_lov_recreate_obj(inode, arg));
2168 case LL_IOC_RECREATE_FID:
2169 RETURN(ll_lov_recreate_fid(inode, arg));
2170 case FSFILT_IOC_FIEMAP:
2171 RETURN(ll_ioctl_fiemap(inode, arg));
2172 case FSFILT_IOC_GETFLAGS:
2173 case FSFILT_IOC_SETFLAGS:
2174 RETURN(ll_iocontrol(inode, file, cmd, arg));
2175 case FSFILT_IOC_GETVERSION_OLD:
2176 case FSFILT_IOC_GETVERSION:
2177 RETURN(put_user(inode->i_generation, (int *)arg));
2178 case LL_IOC_GROUP_LOCK:
2179 RETURN(ll_get_grouplock(inode, file, arg));
2180 case LL_IOC_GROUP_UNLOCK:
2181 RETURN(ll_put_grouplock(inode, file, arg));
2182 case IOC_OBD_STATFS:
2183 RETURN(ll_obd_statfs(inode, (void *)arg));
2185 /* We need to special case any other ioctls we want to handle,
2186 * to send them to the MDS/OST as appropriate and to properly
2187 * network encode the arg field.
2188 case FSFILT_IOC_SETVERSION_OLD:
2189 case FSFILT_IOC_SETVERSION:
2191 case LL_IOC_FLUSHCTX:
2192 RETURN(ll_flush_ctx(inode));
2193 case LL_IOC_PATH2FID: {
2194 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2195 sizeof(struct lu_fid)))
2200 case OBD_IOC_FID2PATH:
2201 RETURN(ll_fid2path(inode, (void *)arg));
2202 case LL_IOC_DATA_VERSION: {
2203 struct ioc_data_version idv;
2206 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2209 rc = ll_data_version(inode, &idv.idv_version,
2210 !(idv.idv_flags & LL_DV_NOFLUSH));
2212 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2218 case LL_IOC_GET_MDTIDX: {
2221 mdtidx = ll_get_mdt_idx(inode);
2225 if (put_user((int)mdtidx, (int*)arg))
2230 case OBD_IOC_GETDTNAME:
2231 case OBD_IOC_GETMDNAME:
2232 RETURN(ll_get_obd_name(inode, cmd, arg));
2233 case LL_IOC_HSM_STATE_GET: {
2234 struct md_op_data *op_data;
2235 struct hsm_user_state *hus;
2242 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2243 LUSTRE_OPC_ANY, hus);
2244 if (IS_ERR(op_data)) {
2246 RETURN(PTR_ERR(op_data));
2249 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2252 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2255 ll_finish_md_op_data(op_data);
2259 case LL_IOC_HSM_STATE_SET: {
2260 struct md_op_data *op_data;
2261 struct hsm_state_set *hss;
2267 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2272 /* Non-root users are forbidden to set or clear flags which are
2273 * NOT defined in HSM_USER_MASK. */
2274 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2275 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2280 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2281 LUSTRE_OPC_ANY, hss);
2282 if (IS_ERR(op_data)) {
2284 RETURN(PTR_ERR(op_data));
2287 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2290 ll_finish_md_op_data(op_data);
2295 case LL_IOC_HSM_ACTION: {
2296 struct md_op_data *op_data;
2297 struct hsm_current_action *hca;
2304 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2305 LUSTRE_OPC_ANY, hca);
2306 if (IS_ERR(op_data)) {
2308 RETURN(PTR_ERR(op_data));
2311 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2314 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2317 ll_finish_md_op_data(op_data);
2321 case LL_IOC_SET_LEASE: {
2322 struct ll_inode_info *lli = ll_i2info(inode);
2323 struct obd_client_handle *och = NULL;
2329 if (!(file->f_mode & FMODE_WRITE))
2334 if (!(file->f_mode & FMODE_READ))
2339 mutex_lock(&lli->lli_och_mutex);
2340 if (fd->fd_lease_och != NULL) {
2341 och = fd->fd_lease_och;
2342 fd->fd_lease_och = NULL;
2344 mutex_unlock(&lli->lli_och_mutex);
2347 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2348 rc = ll_lease_close(och, inode, &lease_broken);
2349 if (rc == 0 && lease_broken)
2355 /* return the type of lease or error */
2356 RETURN(rc < 0 ? rc : (int)mode);
2361 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2363 /* apply for lease */
2364 och = ll_lease_open(inode, file, mode);
2366 RETURN(PTR_ERR(och));
2369 mutex_lock(&lli->lli_och_mutex);
2370 if (fd->fd_lease_och == NULL) {
2371 fd->fd_lease_och = och;
2374 mutex_unlock(&lli->lli_och_mutex);
2376 /* impossible now that only excl is supported for now */
2377 ll_lease_close(och, inode, &lease_broken);
2382 case LL_IOC_GET_LEASE: {
2383 struct ll_inode_info *lli = ll_i2info(inode);
2384 struct ldlm_lock *lock = NULL;
2387 mutex_lock(&lli->lli_och_mutex);
2388 if (fd->fd_lease_och != NULL) {
2389 struct obd_client_handle *och = fd->fd_lease_och;
2391 lock = ldlm_handle2lock(&och->och_lease_handle);
2393 lock_res_and_lock(lock);
2394 if (!ldlm_is_cancel(lock))
2395 rc = och->och_flags &
2396 (FMODE_READ | FMODE_WRITE);
2397 unlock_res_and_lock(lock);
2398 ldlm_lock_put(lock);
2401 mutex_unlock(&lli->lli_och_mutex);
2409 ll_iocontrol_call(inode, file, cmd, arg, &err))
2412 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2418 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Local fallback for kernels without generic_file_llseek_size():
 * validate the target offset (negative offsets need
 * FMODE_UNSIGNED_OFFSET; offsets beyond maxsize are rejected) and, if
 * it differs from the current position, commit it and reset f_version.
 */
2419 static inline loff_t
2420 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2422 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2424 if (offset > maxsize)
2427 if (offset != file->f_pos) {
2428 file->f_pos = offset;
2429 file->f_version = 0;
/*
 * Fallback copy of the kernel's generic_file_llseek_size(): handles
 * SEEK_CUR relative to f_pos under i_mutex (to serialize against
 * concurrent SEEK_CURs) with a special case for the position-query
 * lseek(fd, 0, SEEK_CUR), plus SEEK_DATA/SEEK_HOLE semantics where the
 * whole file is data followed by a virtual hole at eof.  Final bounds
 * check and f_pos commit go through llseek_execute().
 */
2435 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2436 loff_t maxsize, loff_t eof)
2438 struct inode *inode = file->f_dentry->d_inode;
2446 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2447 * position-querying operation. Avoid rewriting the "same"
2448 * f_pos value back to the file because a concurrent read(),
2449 * write() or lseek() might have altered it
2454 * f_lock protects against read/modify/write race with other
2455 * SEEK_CURs. Note that parallel writes and reads behave
2458 mutex_lock(&inode->i_mutex);
2459 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2460 mutex_unlock(&inode->i_mutex);
2464 * In the generic case the entire file is data, so as long as
2465 * offset isn't at the end of the file then the offset is data.
2472 * There is a virtual hole at the end of the file, so as long as
2473 * offset isn't i_size or larger, return i_size.
2481 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA the size must be
 * current, so glimpse it from the OSTs first and use i_size as eof;
 * then delegate to the (ll_)generic_file_llseek_size helper bounded by
 * the filesystem's max byte offset.
 */
2485 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2487 struct inode *inode = file->f_dentry->d_inode;
2488 loff_t retval, eof = 0;
2491 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2492 (origin == SEEK_CUR) ? file->f_pos : 0);
2493 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2494 inode->i_ino, inode->i_generation, inode, retval, retval,
2496 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2498 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2499 retval = ll_glimpse_size(inode);
2502 eof = i_size_read(inode);
2505 retval = ll_generic_file_llseek_size(file, offset, origin,
2506 ll_file_maxbytes(inode), eof);
/*
 * flush handler (called on close()): collect and clear any async
 * writeback errors recorded on the inode and its cl object, and report
 * -EIO unless this fd already saw the write failure (the application
 * has been told once; don't report the same failure twice).
 */
2510 int ll_flush(struct file *file, fl_owner_t id)
2512 struct inode *inode = file->f_dentry->d_inode;
2513 struct ll_inode_info *lli = ll_i2info(inode);
2514 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2517 LASSERT(!S_ISDIR(inode->i_mode));
2519 /* catch async errors that were recorded back when async writeback
2520 * failed for pages in this mapping. */
2521 rc = lli->lli_async_rc;
2522 lli->lli_async_rc = 0;
2523 err = lov_read_and_clear_async_rc(lli->lli_clob);
2527 /* The application has been told write failure already.
2528 * Do not report failure again. */
2529 if (fd->fd_write_failed)
2531 return rc ? -EIO : 0;
2535 * Called to make sure a portion of file has been written out.
2536 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2538 * Return how many pages have been written.
/*
 * Build and run a CIT_FSYNC cl_io over [start, end] with the given
 * fsync mode (NONE/LOCAL/DISCARD/ALL) inside a nested cl environment;
 * on success the number of pages written (fi_nr_written) is returned.
 * NOTE(review): the OSS capability obtained here appears to be
 * released in elided lines — confirm capa_put on all paths.
 */
2540 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2541 enum cl_fsync_mode mode, int ignore_layout)
2543 struct cl_env_nest nest;
2546 struct obd_capa *capa = NULL;
2547 struct cl_fsync_io *fio;
2551 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2552 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2555 env = cl_env_nested_get(&nest);
2557 RETURN(PTR_ERR(env));
2559 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2561 io = ccc_env_thread_io(env);
2562 io->ci_obj = cl_i2info(inode)->lli_clob;
2563 io->ci_ignore_layout = ignore_layout;
2565 /* initialize parameters for sync */
2566 fio = &io->u.ci_fsync;
2567 fio->fi_capa = capa;
2568 fio->fi_start = start;
2570 fio->fi_fid = ll_inode2fid(inode);
2571 fio->fi_mode = mode;
2572 fio->fi_nr_written = 0;
2574 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2575 result = cl_io_loop(env, io);
2577 result = io->ci_result;
2579 result = fio->fi_nr_written;
2580 cl_io_fini(env, io);
2581 cl_env_nested_put(&nest, env);
2589 * When dentry is provided (the 'else' case), *file->f_dentry may be
2590 * null and dentry must be used directly rather than pulled from
2591 * *file->f_dentry as is done otherwise.
/*
 * fsync handler, compiled in three prototype variants depending on the
 * kernel's file_operations->fsync signature.  Flushes/waits on dirty
 * pagecache (range-limited on 4-arg kernels, under i_mutex there),
 * folds in recorded async writeback errors, syncs MDS state via
 * md_sync(), and for regular files additionally runs an OST-side
 * cl_sync_file_range over the whole object, updating the per-fd
 * fd_write_failed flag from the result.
 */
2594 #ifdef HAVE_FILE_FSYNC_4ARGS
2595 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2597 struct dentry *dentry = file->f_dentry;
2598 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2599 int ll_fsync(struct file *file, int datasync)
2601 struct dentry *dentry = file->f_dentry;
2603 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2606 struct inode *inode = dentry->d_inode;
2607 struct ll_inode_info *lli = ll_i2info(inode);
2608 struct ptlrpc_request *req;
2609 struct obd_capa *oc;
2613 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2614 inode->i_generation, inode);
2615 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2617 #ifdef HAVE_FILE_FSYNC_4ARGS
2618 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2619 mutex_lock(&inode->i_mutex);
2621 /* fsync's caller has already called _fdata{sync,write}, we want
2622 * that IO to finish before calling the osc and mdc sync methods */
2623 rc = filemap_fdatawait(inode->i_mapping);
2626 /* catch async errors that were recorded back when async writeback
2627 * failed for pages in this mapping. */
2628 if (!S_ISDIR(inode->i_mode)) {
2629 err = lli->lli_async_rc;
2630 lli->lli_async_rc = 0;
2633 err = lov_read_and_clear_async_rc(lli->lli_clob);
2638 oc = ll_mdscapa_get(inode);
2639 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2645 ptlrpc_req_finished(req);
2647 if (datasync && S_ISREG(inode->i_mode)) {
2648 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2650 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2652 if (rc == 0 && err < 0)
2655 fd->fd_write_failed = true;
2657 fd->fd_write_failed = false;
2660 #ifdef HAVE_FILE_FSYNC_4ARGS
2661 mutex_unlock(&inode->i_mutex);
/*
 * VFS ->lock / ->flock handler: translate a kernel struct file_lock into
 * an LDLM_FLOCK enqueue on the MDT, then mirror the result into the
 * kernel's local lock tables so subsequent conflicts are seen locally.
 * NOTE(review): some lines of the original body are elided here.
 */
2666 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2668 struct inode *inode = file->f_dentry->d_inode;
2669 struct ll_sb_info *sbi = ll_i2sbi(inode);
2670 struct ldlm_enqueue_info einfo = {
2671 .ei_type = LDLM_FLOCK,
2672 .ei_cb_cp = ldlm_flock_completion_ast,
2673 .ei_cbdata = file_lock,
2675 struct md_op_data *op_data;
2676 struct lustre_handle lockh = {0};
2677 ldlm_policy_data_t flock = {{0}};
2683 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2684 inode->i_ino, file_lock);
2686 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* Build the flock policy: BSD flock() vs POSIX fcntl() record locks
 * differ in how the byte range and owner are determined. */
2688 if (file_lock->fl_flags & FL_FLOCK) {
2689 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2690 /* flocks are whole-file locks */
2691 flock.l_flock.end = OFFSET_MAX;
2692 /* For flocks owner is determined by the local file descriptor*/
2693 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2694 } else if (file_lock->fl_flags & FL_POSIX) {
2695 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2696 flock.l_flock.start = file_lock->fl_start;
2697 flock.l_flock.end = file_lock->fl_end;
2701 flock.l_flock.pid = file_lock->fl_pid;
2703 /* Somewhat ugly workaround for svc lockd.
2704 * lockd installs custom fl_lmops->lm_compare_owner that checks
2705 * for the fl_owner to be the same (which it always is on local node
2706 * I guess between lockd processes) and then compares pid.
2707 * As such we assign pid to the owner field to make it all work,
2708 * conflict with normal locks is unlikely since pid space and
2709 * pointer space for current->files are not intersecting */
2710 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2711 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type (F_RDLCK/F_UNLCK/F_WRLCK; case labels are
 * elided in this excerpt) onto an LDLM lock mode. */
2713 switch (file_lock->fl_type) {
2715 einfo.ei_mode = LCK_PR;
2718 /* An unlock request may or may not have any relation to
2719 * existing locks so we may not be able to pass a lock handle
2720 * via a normal ldlm_lock_cancel() request. The request may even
2721 * unlock a byte range in the middle of an existing lock. In
2722 * order to process an unlock request we need all of the same
2723 * information that is given with a normal read or write record
2724 * lock request. To avoid creating another ldlm unlock (cancel)
2725 * message we'll treat a LCK_NL flock request as an unlock. */
2726 einfo.ei_mode = LCK_NL;
2729 einfo.ei_mode = LCK_PW;
2732 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2733 file_lock->fl_type);
/* Non-blocking set requests enqueue with BLOCK_NOWAIT; F_GETLK-style
 * queries use TEST_LOCK (the surrounding case labels are elided). */
2748 flags = LDLM_FL_BLOCK_NOWAIT;
2754 flags = LDLM_FL_TEST_LOCK;
2755 /* Save the old mode so that if the mode in the lock changes we
2756 * can decrement the appropriate reader or writer refcount. */
2757 file_lock->fl_type = einfo.ei_mode;
2760 CERROR("unknown fcntl lock command: %d\n", cmd);
2764 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2765 LUSTRE_OPC_ANY, NULL);
2766 if (IS_ERR(op_data))
2767 RETURN(PTR_ERR(op_data));
2769 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2770 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2771 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* Take (or release/test) the distributed lock on the MDT. */
2773 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2774 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror a successful enqueue (or any unlock) into the kernel's local
 * flock/POSIX lock tables; TEST_LOCK queries are never recorded. */
2776 if ((file_lock->fl_flags & FL_FLOCK) &&
2777 (rc == 0 || file_lock->fl_type == F_UNLCK))
2778 rc2 = flock_lock_file_wait(file, file_lock);
2779 if ((file_lock->fl_flags & FL_POSIX) &&
2780 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2781 !(flags & LDLM_FL_TEST_LOCK))
2782 rc2 = posix_lock_file_wait(file, file_lock);
/* Local recording failed after the server lock was granted: undo the
 * server-side lock by re-enqueueing with LCK_NL (treated as unlock,
 * see the comment above). */
2784 if (rc2 && file_lock->fl_type != F_UNLCK) {
2785 einfo.ei_mode = LCK_NL;
2786 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2787 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2791 ll_finish_md_op_data(op_data);
/*
 * ->lock/->flock handler installed for "-o noflock" mounts (see
 * ll_file_operations_noflock below, which documents that it returns
 * ENOSYS for flock calls).  Body elided in this excerpt.
 */
2796 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2804 * test if some locks matching bits and l_req_mode are acquired
2805 * - bits can be in different locks
2806 * - if found clear the common lock bits in *bits
2807 * - the bits not found, are kept in *bits
2809 * \param bits [IN] searched lock bits
2810 * \param l_req_mode [IN] searched lock mode
2811 * \retval boolean, true iff all bits are found
2813 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2815 struct lustre_handle lockh;
2816 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes at once. */
2817 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2818 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2827 fid = &ll_i2info(inode)->lli_fid;
2828 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2829 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for existence, do not take references. */
2831 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebits bit individually; every matched bit is cleared
 * from *bits, so the loop stops early once all bits were found. */
2832 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2833 policy.l_inodebits.bits = *bits & (1 << i);
2834 if (policy.l_inodebits.bits == 0)
2837 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2838 &policy, mode, &lockh)) {
2839 struct ldlm_lock *lock;
2841 lock = ldlm_handle2lock(&lockh);
/* A matched lock may cover more bits than the one probed; clear
 * everything the lock grants. */
2844 ~(lock->l_policy_data.l_inodebits.bits);
2845 LDLM_LOCK_PUT(lock);
2847 *bits &= ~policy.l_inodebits.bits;
/*
 * Match a cached MDS inodebits lock covering \a bits in any of the four
 * CR/CW/PR/PW modes; the matched lock's handle is returned via \a lockh.
 * \retval the mode of the matched lock, or 0 when none matches
 *         (rc comes from md_lock_match; return statement elided here).
 */
2854 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2855 struct lustre_handle *lockh, __u64 flags)
2857 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2862 fid = &ll_i2info(inode)->lli_fid;
2863 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2865 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2866 fid, LDLM_IBITS, &policy,
2867 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process the result of a revalidate RPC: tolerate -ENOENT on
 * special (non-regular, non-directory) inodes whose MDS object was
 * already unlinked, and log any other failure.
 */
2871 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2873 /* Already unlinked. Just update nlink and return success */
2874 if (rc == -ENOENT) {
2876 /* This path cannot be hit for regular files unless in
2877 * case of obscure races, so no need to validate
2879 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2881 } else if (rc != 0) {
2882 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2883 ll_get_fsname(inode->i_sb, NULL, 0),
2884 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode attributes from the MDS.
 *
 * Two paths: when the server supports OBD_CONNECT_ATTRFID, issue an
 * intent getattr/lookup by FID (no name); otherwise, if no matching
 * ibits lock is already cached locally, fall back to a plain
 * md_getattr() RPC.  NOTE(review): several lines of the original body
 * are elided in this excerpt.
 */
2890 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2893 struct inode *inode = dentry->d_inode;
2894 struct ptlrpc_request *req = NULL;
2895 struct obd_export *exp;
2899 LASSERT(inode != NULL);
2901 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2902 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2904 exp = ll_i2mdexp(inode);
2906 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2907 * But under CMD case, it caused some lock issues, should be fixed
2908 * with new CMD ibits lock. See bug 12718 */
2909 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2910 struct lookup_intent oit = { .it_op = IT_GETATTR };
2911 struct md_op_data *op_data;
/* A pure LOOKUP-bit revalidate only needs an IT_LOOKUP intent. */
2913 if (ibits == MDS_INODELOCK_LOOKUP)
2914 oit.it_op = IT_LOOKUP;
2916 /* Call getattr by fid, so do not provide name at all. */
2917 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2918 dentry->d_inode, NULL, 0, 0,
2919 LUSTRE_OPC_ANY, NULL);
2920 if (IS_ERR(op_data))
2921 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE flags the intent as a staleness check for the MDS. */
2923 oit.it_create_mode |= M_CHECK_STALE;
2924 rc = md_intent_lock(exp, op_data, NULL, 0,
2925 /* we are not interested in name
2928 ll_md_blocking_ast, 0);
2929 ll_finish_md_op_data(op_data);
2930 oit.it_create_mode &= ~M_CHECK_STALE;
2932 rc = ll_inode_revalidate_fini(inode, rc);
2936 rc = ll_revalidate_it_finish(req, &oit, dentry);
2938 ll_intent_release(&oit);
2942 /* Unlinked? Unhash dentry, so it is not picked up later by
2943 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2944 here to preserve get_cwd functionality on 2.6.
2946 if (!dentry->d_inode->i_nlink)
2947 d_lustre_invalidate(dentry, 0);
2949 ll_lookup_finish_locks(&oit, dentry);
2950 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2951 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2952 obd_valid valid = OBD_MD_FLGETATTR;
2953 struct md_op_data *op_data;
/* Regular files also need the EA (striping) data, so size the reply
 * buffer for the largest MDS EA and request the EA-related fields. */
2956 if (S_ISREG(inode->i_mode)) {
2957 rc = ll_get_max_mdsize(sbi, &ealen);
2960 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2963 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2964 0, ealen, LUSTRE_OPC_ANY,
2966 if (IS_ERR(op_data))
2967 RETURN(PTR_ERR(op_data));
2969 op_data->op_valid = valid;
2970 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2971 * capa for this inode. Because we only keep capas of dirs
2973 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2974 ll_finish_md_op_data(op_data);
2976 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2980 rc = ll_prep_inode(&inode, req, NULL, NULL);
2983 ptlrpc_req_finished(req);
/*
 * Revalidate attributes and, for regular files, refresh the file size
 * via a glimpse; for non-regular objects only the timestamps are copied
 * out of the cached lock value block (LVB).
 */
2987 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2990 struct inode *inode = dentry->d_inode;
2994 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2998 /* if object isn't regular file, don't validate size */
2999 if (!S_ISREG(inode->i_mode)) {
3000 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3001 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3002 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3004 /* In case of restore, the MDT has the right size and has
3005 * already send it back without granting the layout lock,
3006 * inode is up-to-date so glimpse is useless.
3007 * Also to glimpse we need the layout, in case of a running
3008 * restore the MDT holds the layout lock so the glimpse will
3009 * block up to the end of restore (getattr will block)
3011 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3012 rc = ll_glimpse_size(inode);
/*
 * Fill *stat for ->getattr: revalidate the inode (UPDATE|LOOKUP ibits)
 * and then copy the in-core inode attributes into the kstat.
 */
3017 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3018 struct lookup_intent *it, struct kstat *stat)
3020 struct inode *inode = de->d_inode;
3021 struct ll_sb_info *sbi = ll_i2sbi(inode);
3022 struct ll_inode_info *lli = ll_i2info(inode);
3025 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3026 MDS_INODELOCK_LOOKUP);
3027 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3032 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an inode number squeezed from the FID. */
3033 if (ll_need_32bit_api(sbi))
3034 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3036 stat->ino = inode->i_ino;
3037 stat->mode = inode->i_mode;
3038 stat->nlink = inode->i_nlink;
3039 stat->uid = inode->i_uid;
3040 stat->gid = inode->i_gid;
3041 stat->rdev = inode->i_rdev;
3042 stat->atime = inode->i_atime;
3043 stat->mtime = inode->i_mtime;
3044 stat->ctime = inode->i_ctime;
3045 stat->blksize = 1 << inode->i_blkbits;
3047 stat->size = i_size_read(inode);
3048 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: delegate to ll_getattr_it() with a fresh
 * IT_GETATTR lookup intent. */
3052 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3054 struct lookup_intent it = { .it_op = IT_GETATTR };
3056 return ll_getattr_it(mnt, de, &it, stat);
3059 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * VFS ->fiemap handler: marshal the kernel fiemap_extent_info into a
 * Lustre ll_user_fiemap request, run ll_do_fiemap(), and copy the
 * mapped extents back.
 */
3060 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3061 __u64 start, __u64 len)
3065 struct ll_user_fiemap *fiemap;
3066 unsigned int extent_count = fieinfo->fi_extents_max;
/* One allocation sized for the header plus the caller's extent array. */
3068 num_bytes = sizeof(*fiemap) + (extent_count *
3069 sizeof(struct ll_fiemap_extent));
3070 OBD_ALLOC_LARGE(fiemap, num_bytes);
3075 fiemap->fm_flags = fieinfo->fi_flags;
3076 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3077 fiemap->fm_start = start;
3078 fiemap->fm_length = len;
/* Seed only the first extent from userspace: it carries the
 * continuation cookie for a multi-call mapping. */
3079 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3080 sizeof(struct ll_fiemap_extent));
3082 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy the results (flags, count, and mapped extents) back out. */
3084 fieinfo->fi_flags = fiemap->fm_flags;
3085 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3086 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3087 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3089 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: return a referenced copy of the cached POSIX ACL.
 * The cache (lli_posix_acl) is protected by lli_lock.
 */
3094 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3096 struct ll_inode_info *lli = ll_i2info(inode);
3097 struct posix_acl *acl = NULL;
3100 spin_lock(&lli->lli_lock);
3101 /* VFS' acl_permission_check->check_acl will release the refcount */
3102 acl = posix_acl_dup(lli->lli_posix_acl);
3103 spin_unlock(&lli->lli_lock);
3108 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback for older generic_permission() variants; the
 * 4-arg kernels pass RCU-walk flags, in which case we cannot block
 * (the acl lookup sleeps), so that path bails out early.
 */
3110 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3111 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3113 ll_check_acl(struct inode *inode, int mask)
3116 # ifdef CONFIG_FS_POSIX_ACL
3117 struct posix_acl *acl;
3121 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3122 if (flags & IPERM_FLAG_RCU)
3125 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
/* Evaluate the ACL against the requested mask and drop the reference
 * taken by ll_get_acl(). */
3130 rc = posix_acl_permission(inode, acl, mask);
3131 posix_acl_release(acl);
3134 # else /* !CONFIG_FS_POSIX_ACL */
3136 # endif /* CONFIG_FS_POSIX_ACL */
3138 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3140 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * VFS ->permission handler; three prototypes cover the kernel's
 * different permission-callback signatures.
 */
3141 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3143 # ifdef HAVE_INODE_PERMISION_2ARGS
3144 int ll_inode_permission(struct inode *inode, int mask)
3146 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* In RCU-walk mode we may not block; bail out so the VFS retries in
 * ref-walk mode (the early-return line is elided in this excerpt). */
3153 #ifdef MAY_NOT_BLOCK
3154 if (mask & MAY_NOT_BLOCK)
3156 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3157 if (flags & IPERM_FLAG_RCU)
3161 /* as root inode are NOT getting validated in lookup operation,
3162 * need to do it before permission check. */
3164 if (inode == inode->i_sb->s_root->d_inode) {
3165 struct lookup_intent it = { .it_op = IT_LOOKUP };
3167 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3168 MDS_INODELOCK_LOOKUP);
3173 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3174 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote-client setups delegate the check to the MDS-side permission
 * service instead of the local generic check. */
3176 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3177 return lustre_check_remote_perm(inode, mask);
3179 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3180 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3185 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations table: no .flock/.lock members, so the kernel
 * falls back to purely local lock handling. */
3186 struct file_operations ll_file_operations = {
3187 .read = ll_file_read,
3188 .aio_read = ll_file_aio_read,
3189 .write = ll_file_write,
3190 .aio_write = ll_file_aio_write,
3191 .unlocked_ioctl = ll_file_ioctl,
3192 .open = ll_file_open,
3193 .release = ll_file_release,
3194 .mmap = ll_file_mmap,
3195 .llseek = ll_file_seek,
3196 .splice_read = ll_file_splice_read,
/* File operations table with cluster-wide flock/POSIX locking enabled:
 * both .flock and .lock are routed through ll_file_flock(). */
3201 struct file_operations ll_file_operations_flock = {
3202 .read = ll_file_read,
3203 .aio_read = ll_file_aio_read,
3204 .write = ll_file_write,
3205 .aio_write = ll_file_aio_write,
3206 .unlocked_ioctl = ll_file_ioctl,
3207 .open = ll_file_open,
3208 .release = ll_file_release,
3209 .mmap = ll_file_mmap,
3210 .llseek = ll_file_seek,
3211 .splice_read = ll_file_splice_read,
3214 .flock = ll_file_flock,
3215 .lock = ll_file_flock
3218 /* These are for -o noflock - to return ENOSYS on flock calls */
3219 struct file_operations ll_file_operations_noflock = {
3220 .read = ll_file_read,
3221 .aio_read = ll_file_aio_read,
3222 .write = ll_file_write,
3223 .aio_write = ll_file_aio_write,
3224 .unlocked_ioctl = ll_file_ioctl,
3225 .open = ll_file_open,
3226 .release = ll_file_release,
3227 .mmap = ll_file_mmap,
3228 .llseek = ll_file_seek,
3229 .splice_read = ll_file_splice_read,
3232 .flock = ll_file_noflock,
3233 .lock = ll_file_noflock
/* Inode operations for regular Lustre files; fiemap and get_acl entries
 * are compiled in only when the kernel provides those hooks. */
3236 struct inode_operations ll_file_inode_operations = {
3237 .setattr = ll_setattr,
3238 .getattr = ll_getattr,
3239 .permission = ll_inode_permission,
3240 .setxattr = ll_setxattr,
3241 .getxattr = ll_getxattr,
3242 .listxattr = ll_listxattr,
3243 .removexattr = ll_removexattr,
3244 #ifdef HAVE_LINUX_FIEMAP_H
3245 .fiemap = ll_fiemap,
3247 #ifdef HAVE_IOP_GET_ACL
3248 .get_acl = ll_get_acl,
3252 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore. */
3253 static struct llioc_ctl_data {
3254 struct rw_semaphore ioc_sem;
3255 cfs_list_t ioc_head;
3257 __RWSEM_INITIALIZER(llioc.ioc_sem),
3258 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl command numbers it
 * serves (iocd_cmd is a trailing variable-length array). */
3263 cfs_list_t iocd_list;
3264 unsigned int iocd_size;
3265 llioc_callback_t iocd_cb;
3266 unsigned int iocd_count;
3267 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: allocate an llioc_data holding the
 * callback and its command list, and append it to the global registry.
 * The returned pointer serves as the "magic" token for unregistering.
 */
3270 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3273 struct llioc_data *in_data = NULL;
/* Reject bad callbacks/command lists and out-of-range counts. */
3276 if (cb == NULL || cmd == NULL ||
3277 count > LLIOC_MAX_CMD || count < 0)
3280 size = sizeof(*in_data) + count * sizeof(unsigned int);
3281 OBD_ALLOC(in_data, size);
3282 if (in_data == NULL)
3285 memset(in_data, 0, sizeof(*in_data));
3286 in_data->iocd_size = size;
3287 in_data->iocd_cb = cb;
3288 in_data->iocd_count = count;
3289 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3291 down_write(&llioc.ioc_sem);
3292 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3293 up_write(&llioc.ioc_sem);
/*
 * Remove a handler previously returned by ll_iocontrol_register() from
 * the registry and free it; warns if the magic token is unknown.
 */
3298 void ll_iocontrol_unregister(void *magic)
3300 struct llioc_data *tmp;
3305 down_write(&llioc.ioc_sem);
3306 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Found the matching entry (comparison line elided): unlink it,
 * drop the lock, and free using the stored allocation size. */
3308 unsigned int size = tmp->iocd_size;
3310 cfs_list_del(&tmp->iocd_list);
3311 up_write(&llioc.ioc_sem);
3313 OBD_FREE(tmp, size);
3317 up_write(&llioc.ioc_sem);
3319 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3322 EXPORT_SYMBOL(ll_iocontrol_register);
3323 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the dynamically registered handlers: walk the
 * registry under the read lock and invoke the first callback whose
 * command list contains \a cmd; stop iterating if it returns LLIOC_STOP.
 * The handler's result code is passed back through *rcp.
 */
3325 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3326 unsigned int cmd, unsigned long arg, int *rcp)
3328 enum llioc_iter ret = LLIOC_CONT;
3329 struct llioc_data *data;
3330 int rc = -EINVAL, i;
3332 down_read(&llioc.ioc_sem);
3333 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3334 for (i = 0; i < data->iocd_count; i++) {
3335 if (cmd != data->iocd_cmd[i])
3338 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3342 if (ret == LLIOC_STOP)
3345 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the inode's cl_object via
 * cl_conf_set().  For OBJECT_CONF_SET, the layout lock is only allowed
 * to be matched by others after the layout has been applied.
 */
3352 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3354 struct ll_inode_info *lli = ll_i2info(inode);
3355 struct cl_env_nest nest;
/* No cl_object yet means there is nothing to configure. */
3360 if (lli->lli_clob == NULL)
3363 env = cl_env_nested_get(&nest);
3365 RETURN(PTR_ERR(env));
3367 result = cl_conf_set(env, lli->lli_clob, conf);
3368 cl_env_nested_put(&nest, env);
3370 if (conf->coc_opc == OBJECT_CONF_SET) {
3371 struct ldlm_lock *lock = conf->coc_lock;
3373 LASSERT(lock != NULL);
3374 LASSERT(ldlm_has_layout(lock));
3376 /* it can only be allowed to match after layout is
3377 * applied to inode otherwise false layout would be
3378 * seen. Applying layout should happen before dropping
3379 * the intent lock. */
3380 ldlm_lock_allow_match(lock);
3386 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3387 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3390 struct ll_sb_info *sbi = ll_i2sbi(inode);
3391 struct obd_capa *oc;
3392 struct ptlrpc_request *req;
3393 struct mdt_body *body;
3400 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3401 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3402 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already delivered with the lock's LVB: nothing to fetch. */
3404 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3407 /* if layout lock was granted right away, the layout is returned
3408 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3409 * blocked and then granted via completion ast, we have to fetch
3410 * layout here. Please note that we can't use the LVB buffer in
3411 * completion AST because it doesn't have a large enough buffer */
3412 oc = ll_mdscapa_get(inode);
3413 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA ("lustre.lov" layout) from the MDT by getxattr. */
3415 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3416 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3422 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3423 if (body == NULL || body->eadatasize > lmmsize)
3424 GOTO(out, rc = -EPROTO);
3426 lmmsize = body->eadatasize;
3427 if (lmmsize == 0) /* empty layout */
3430 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3432 GOTO(out, rc = -EFAULT);
/* Copy the layout into a fresh buffer and install it as the lock's
 * LVB, replacing any stale buffer under the resource lock. */
3434 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3435 if (lvbdata == NULL)
3436 GOTO(out, rc = -ENOMEM);
3438 memcpy(lvbdata, lmm, lmmsize);
3439 lock_res_and_lock(lock);
3440 if (lock->l_lvb_data != NULL)
3441 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3443 lock->l_lvb_data = lvbdata;
3444 lock->l_lvb_len = lmmsize;
3445 unlock_res_and_lock(lock);
3450 ptlrpc_req_finished(req);
3455 * Apply the layout to the inode. Layout lock is held and will be released
3458 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3459 struct inode *inode, __u32 *gen, bool reconf)
3461 struct ll_inode_info *lli = ll_i2info(inode);
3462 struct ll_sb_info *sbi = ll_i2sbi(inode);
3463 struct ldlm_lock *lock;
3464 struct lustre_md md = { NULL };
3465 struct cl_object_conf conf;
3468 bool wait_layout = false;
3471 LASSERT(lustre_handle_is_used(lockh));
3473 lock = ldlm_handle2lock(lockh);
3474 LASSERT(lock != NULL);
3475 LASSERT(ldlm_has_layout(lock));
3477 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3478 inode, PFID(&lli->lli_fid), reconf);
3480 /* in case this is a caching lock and reinstate with new inode */
3481 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3483 lock_res_and_lock(lock);
3484 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3485 unlock_res_and_lock(lock);
3486 /* checking lvb_ready is racy but this is okay. The worst case is
3487 * that multi processes may configure the file on the same time. */
3489 if (lvb_ready || !reconf) {
3492 /* layout_gen must be valid if layout lock is not
3493 * cancelled and stripe has already set */
3494 *gen = lli->lli_layout_gen;
/* Make sure the layout blob is present in the lock's LVB (fetch it
 * from the MDT if the lock was granted via completion AST). */
3500 rc = ll_layout_fetch(inode, lock);
3504 /* for layout lock, lmm is returned in lock's lvb.
3505 * lvb_data is immutable if the lock is held so it's safe to access it
3506 * without res lock. See the description in ldlm_lock_decref_internal()
3507 * for the condition to free lvb_data of layout lock */
3508 if (lock->l_lvb_data != NULL) {
3509 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3510 lock->l_lvb_data, lock->l_lvb_len);
3512 *gen = LL_LAYOUT_GEN_EMPTY;
3514 *gen = md.lsm->lsm_layout_gen;
3517 CERROR("%s: file "DFID" unpackmd error: %d\n",
3518 ll_get_fsname(inode->i_sb, NULL, 0),
3519 PFID(&lli->lli_fid), rc);
3525 /* set layout to file. Unlikely this will fail as old layout was
3526 * surely eliminated */
3527 memset(&conf, 0, sizeof conf);
3528 conf.coc_opc = OBJECT_CONF_SET;
3529 conf.coc_inode = inode;
3530 conf.coc_lock = lock;
3531 conf.u.coc_md = &md;
3532 rc = ll_layout_conf(inode, &conf);
3535 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3537 /* refresh layout failed, need to wait */
3538 wait_layout = rc == -EBUSY;
/* Drop our lock reference(s); the layout lock is released here per
 * the contract in the function's header comment. */
3542 LDLM_LOCK_PUT(lock);
3543 ldlm_lock_decref(lockh, mode);
3545 /* wait for IO to complete if it's still being used. */
3547 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3548 ll_get_fsname(inode->i_sb, NULL, 0),
3549 inode, PFID(&lli->lli_fid));
/* OBJECT_CONF_WAIT blocks until in-flight IO under the old layout
 * drains, after which the caller may retry the reconfiguration. */
3551 memset(&conf, 0, sizeof conf);
3552 conf.coc_opc = OBJECT_CONF_WAIT;
3553 conf.coc_inode = inode;
3554 rc = ll_layout_conf(inode, &conf);
3558 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3559 PFID(&lli->lli_fid), rc);
3565 * This function checks if there exists a LAYOUT lock on the client side,
3566 * or enqueues it if it doesn't have one in cache.
3568 * This function will not hold layout lock so it may be revoked any time after
3569 * this function returns. Any operations depend on layout should be redone
3572 * This function should be called before lov_io_init() to get an uptodate
3573 * layout version, the caller should save the version number and after IO
3574 * is finished, this function should be called again to verify that layout
3575 * is not changed during IO time.
3577 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3579 struct ll_inode_info *lli = ll_i2info(inode);
3580 struct ll_sb_info *sbi = ll_i2sbi(inode);
3581 struct md_op_data *op_data;
3582 struct lookup_intent it;
3583 struct lustre_handle lockh;
3585 struct ldlm_enqueue_info einfo = {
3586 .ei_type = LDLM_IBITS,
3588 .ei_cb_bl = ll_md_blocking_ast,
3589 .ei_cb_cp = ldlm_completion_ast,
/* If the server doesn't support layout locks, just report the current
 * cached generation and return. */
3594 *gen = lli->lli_layout_gen;
3595 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3599 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3600 LASSERT(S_ISREG(inode->i_mode));
3602 /* mostly layout lock is caching on the local side, so try to match
3603 * it before grabbing layout lock mutex. */
3604 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3605 if (mode != 0) { /* hit cached lock */
3606 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3610 /* better hold lli_layout_mutex to try again otherwise
3611 * it will have starvation problem. */
3614 /* take layout lock mutex to enqueue layout lock exclusively. */
3615 mutex_lock(&lli->lli_layout_mutex);
3618 /* try again. Maybe somebody else has done this. */
3619 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3620 if (mode != 0) { /* hit cached lock */
3621 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3625 mutex_unlock(&lli->lli_layout_mutex);
3629 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3630 0, 0, LUSTRE_OPC_ANY, NULL);
3631 if (IS_ERR(op_data)) {
3632 mutex_unlock(&lli->lli_layout_mutex);
3633 RETURN(PTR_ERR(op_data));
3636 /* have to enqueue one */
3637 memset(&it, 0, sizeof(it));
3638 it.it_op = IT_LAYOUT;
3639 lockh.cookie = 0ULL;
3641 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3642 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3643 PFID(&lli->lli_fid));
3645 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The intent may carry a request reference; release it now that the
 * enqueue has completed. */
3647 if (it.d.lustre.it_data != NULL)
3648 ptlrpc_req_finished(it.d.lustre.it_data);
3649 it.d.lustre.it_data = NULL;
3651 ll_finish_md_op_data(op_data);
/* Take over the lock mode from the intent before dropping the intent's
 * own lock reference, then apply the new layout. */
3653 mode = it.d.lustre.it_lock_mode;
3654 it.d.lustre.it_lock_mode = 0;
3655 ll_intent_drop_lock(&it);
3658 /* set lock data in case this is a new lock */
3659 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3660 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3664 mutex_unlock(&lli->lli_layout_mutex);
3670 * This function send a restore request to the MDT
3672 int ll_layout_restore(struct inode *inode)
3674 struct hsm_user_request *hur;
3678 len = sizeof(struct hsm_user_request) +
3679 sizeof(struct hsm_user_item);
3680 OBD_ALLOC(hur, len);
3684 hur->hur_request.hr_action = HUA_RESTORE;
3685 hur->hur_request.hr_archive_id = 0;
3686 hur->hur_request.hr_flags = 0;
3687 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3688 sizeof(hur->hur_user_item[0].hui_fid));
3689 hur->hur_user_item[0].hui_extent.length = -1;
3690 hur->hur_request.hr_itemcount = 1;
3691 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,