4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab cache
 * (GFP allows I/O) and initialize the write-failure flag.
 * NOTE(review): the slab-allocation-failure check and the return statement
 * are elided in this extraction — confirm against the full file.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
61 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * back to its slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the VFS inode attributes (mode, a/m/ctime, size, blocks, flags),
 * the current IO epoch, the MDS capability and the given open handle @fh
 * into @op_data, in preparation for an MDS operation (e.g. close).
 * Also sets MDS_DATA_MODIFIED bias when the inode carries the
 * LLIF_DATA_MODIFIED flag so the MDS learns data was dirtied.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Fill @op_data for a close RPC of open handle @och: mark the timestamp
 * attributes valid, add size/blocks when Size-on-MDS is not in use (or
 * the inode is not a regular file), close the IO epoch, and pack the
 * inode attributes plus the open-handle cookie.
 * NOTE(review): several lines are elided here in this extraction (the
 * branch body after the FMODE_WRITE test in particular) — the exact
 * control flow around ll_ioepoch_close() cannot be confirmed from here.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for open handle @och of @inode.
 *
 * Handles the Size-on-MDS protocol: if the MDS asks for a SOM update
 * (presumably signalled by md_close()'s return — the check is elided in
 * this extraction), gather size from the OSTs via ll_som_update().  On
 * success with MDS_DATA_MODIFIED bias, clears LLIF_DATA_MODIFIED under
 * lli_lock.  Destroys OST objects named in the close reply, re-queues
 * DONE_WRITING when the epoch was not closed, clears open replay data
 * and poisons the handle cookie with DEAD_HANDLE_MAGIC.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Drop the per-mode (write/exec/read) MDS open handle of @inode when its
 * use count reaches zero.  Selects the handle slot by @flags, and under
 * lli_och_mutex either bails out (still users) or detaches the handle;
 * the actual close RPC goes through ll_close_inode_openhandle() outside
 * the mutex.  Races with concurrent freeing are tolerated (och may be
 * NULL by the time we look).
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: release group lock and any leftover lease
 * (leases normally released by the application; here we clean up after
 * a crash), close a dedicated fd openhandle if present, decrement the
 * per-mode open count, and — unless a matching cached OPEN ldlm lock
 * lets us skip the MDS round trip — fall through to ll_md_real_close().
 * Finally detaches and frees the ll_file_data and closes the capability.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 if (fd->fd_lease_och != NULL) {
257 /* Usually the lease is not released when the
258 * application crashed, we need to release here. */
259 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
260 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
261 PFID(&lli->lli_fid), rc, lease_broken);
263 fd->fd_lease_och = NULL;
266 if (fd->fd_och != NULL) {
267 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och);
272 /* Let's see if we have good enough OPEN lock on the file and if
273 we can skip talking to MDS */
274 if (file->f_dentry->d_inode) { /* Can this ever be false? */
276 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
277 struct lustre_handle lockh;
278 struct inode *inode = file->f_dentry->d_inode;
279 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
281 mutex_lock(&lli->lli_och_mutex);
282 if (fd->fd_omode & FMODE_WRITE) {
284 LASSERT(lli->lli_open_fd_write_count);
285 lli->lli_open_fd_write_count--;
286 } else if (fd->fd_omode & FMODE_EXEC) {
288 LASSERT(lli->lli_open_fd_exec_count);
289 lli->lli_open_fd_exec_count--;
292 LASSERT(lli->lli_open_fd_read_count);
293 lli->lli_open_fd_read_count--;
295 mutex_unlock(&lli->lli_och_mutex);
297 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
298 LDLM_IBITS, &policy, lockmode,
300 rc = ll_md_real_close(file->f_dentry->d_inode,
304 CERROR("Releasing a file %p with negative dentry %p. Name %s",
305 file, file->f_dentry, file->f_dentry->d_name.name);
309 LUSTRE_FPRIVATE(file) = NULL;
310 ll_file_data_put(fd);
311 ll_capa_close(inode);
316 /* While this returns an error code, fput() the caller does not, so we need
317 * to make every effort to clean up all of our state here. Also, applications
318 * rarely check close errors and even if an error is returned they will not
319 * re-try the close call.
/*
 * VFS ->release() hook.  Cleans up remote-ACL session state for the root
 * inode on remote clients, stops the statahead thread when this fd was
 * the one that started it, clears async write errors on regular files,
 * and delegates the real close work to ll_md_close().  The filesystem
 * root is special-cased: its ll_file_data is dropped without an MDS
 * close.  Errors returned here are mostly ignored by callers (fput).
 */
321 int ll_file_release(struct inode *inode, struct file *file)
323 struct ll_file_data *fd;
324 struct ll_sb_info *sbi = ll_i2sbi(inode);
325 struct ll_inode_info *lli = ll_i2info(inode);
329 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
330 inode->i_generation, inode);
332 #ifdef CONFIG_FS_POSIX_ACL
333 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
334 inode == inode->i_sb->s_root->d_inode) {
335 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
338 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
339 fd->fd_flags &= ~LL_FILE_RMTACL;
340 rct_del(&sbi->ll_rct, cfs_curproc_pid());
341 et_search_free(&sbi->ll_et, cfs_curproc_pid());
346 if (inode->i_sb->s_root != file->f_dentry)
347 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
348 fd = LUSTRE_FPRIVATE(file);
351 /* The last ref on @file, maybe not the the owner pid of statahead.
352 * Different processes can open the same dir, "ll_opendir_key" means:
353 * it is me that should stop the statahead thread. */
354 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
355 lli->lli_opendir_pid != 0)
356 ll_stop_statahead(inode, lli->lli_opendir_key);
358 if (inode->i_sb->s_root == file->f_dentry) {
359 LUSTRE_FPRIVATE(file) = NULL;
360 ll_file_data_put(fd);
364 if (!S_ISDIR(inode->i_mode)) {
365 lov_read_and_clear_async_rc(lli->lli_clob);
366 lli->lli_async_rc = 0;
369 rc = ll_md_close(sbi->ll_md_exp, inode, file);
371 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
372 libcfs_debug_dumplog();
/*
 * Perform an MDS intent-open for @file (used on the NFS/revalidate path
 * and when setting stripe info).  Requests an OPEN lock unless stripe
 * parameters (@lmm/@lmmsize) are being set, sends md_intent_lock() by
 * FID, and on success instantiates the inode from the reply and records
 * the lock data.  A surplus open handle obtained despite an intent
 * error is released via ll_release_openhandle() to avoid leaking it.
 */
377 static int ll_intent_file_open(struct file *file, void *lmm,
378 int lmmsize, struct lookup_intent *itp)
380 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
381 struct dentry *parent = file->f_dentry->d_parent;
382 const char *name = file->f_dentry->d_name.name;
383 const int len = file->f_dentry->d_name.len;
384 struct md_op_data *op_data;
385 struct ptlrpc_request *req;
386 __u32 opc = LUSTRE_OPC_ANY;
393 /* Usually we come here only for NFSD, and we want open lock.
394 But we can also get here with pre 2.6.15 patchless kernels, and in
395 that case that lock is also ok */
396 /* We can also get here if there was cached open handle in revalidate_it
397 * but it disappeared while we were getting from there to ll_file_open.
398 * But this means this file was closed and immediatelly opened which
399 * makes a good candidate for using OPEN lock */
400 /* If lmmsize & lmm are not 0, we are just setting stripe info
401 * parameters. No need for the open lock */
402 if (lmm == NULL && lmmsize == 0) {
403 itp->it_flags |= MDS_OPEN_LOCK;
404 if (itp->it_flags & FMODE_WRITE)
405 opc = LUSTRE_OPC_CREATE;
408 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
409 file->f_dentry->d_inode, name, len,
412 RETURN(PTR_ERR(op_data));
414 itp->it_flags |= MDS_OPEN_BY_FID;
415 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
416 0 /*unused */, &req, ll_md_blocking_ast, 0);
417 ll_finish_md_op_data(op_data);
419 /* reason for keep own exit path - don`t flood log
420 * with messages with -ESTALE errors.
422 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
423 it_open_error(DISP_OPEN_OPEN, itp))
425 ll_release_openhandle(file->f_dentry, itp);
429 if (it_disposition(itp, DISP_LOOKUP_NEG))
430 GOTO(out, rc = -ENOENT);
432 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
433 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
434 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
438 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
439 if (!rc && itp->d.lustre.it_lock_mode)
440 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
444 ptlrpc_req_finished(itp->d.lustre.it_data);
445 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
446 ll_intent_drop_lock(itp);
452 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
453 * not believe attributes if a few ioepoch holders exist. Attributes for
454 * previous ioepoch if new one is opened are also skipped by MDS.
/*
 * Record a newly obtained @ioepoch on the client inode, if it is
 * non-zero and different from the current one.  No locking: per the
 * comment above, MDS ignores attributes while multiple epoch holders
 * exist, so a racy update is harmless.
 */
456 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
458 if (ioepoch && lli->lli_ioepoch != ioepoch) {
459 lli->lli_ioepoch = ioepoch;
460 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
461 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT_BODY of the intent-open
 * reply carried in @it: open handle, FID, lease lock cookie, magic and
 * open flags; then register the open for replay with the MD layer.
 * Returns md_set_open_replay_data()'s result.
 */
465 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
466 struct obd_client_handle *och)
468 struct ptlrpc_request *req = it->d.lustre.it_data;
469 struct mdt_body *body;
471 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
472 och->och_fh = body->handle;
473 och->och_fid = body->fid1;
474 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
475 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
476 och->och_flags = it->it_flags;
478 return md_set_open_replay_data(md_exp, och, req);
/*
 * Complete the client-side part of an open: fill @och from the intent
 * reply (when a new handle is being created — the guard condition is
 * elided in this extraction), adopt the server's IO epoch, attach @fd
 * as the file's private data, initialize readahead state and remember
 * the effective open mode.
 */
481 int ll_local_open(struct file *file, struct lookup_intent *it,
482 struct ll_file_data *fd, struct obd_client_handle *och)
484 struct inode *inode = file->f_dentry->d_inode;
485 struct ll_inode_info *lli = ll_i2info(inode);
488 LASSERT(!LUSTRE_FPRIVATE(file));
493 struct ptlrpc_request *req = it->d.lustre.it_data;
494 struct mdt_body *body;
497 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
501 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
502 ll_ioepoch_open(lli, body->ioepoch);
505 LUSTRE_FPRIVATE(file) = fd;
506 ll_readahead_init(inode, &fd->fd_ras);
507 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
512 /* Open a file, and (for the very first open) create objects on the OSTs at
513 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
514 * creation or open until ll_lov_setstripe() ioctl is called.
516 * If we already have the stripe MD locally then we don't request it in
517 * md_open(), by passing a lmm_size = 0.
519 * It is up to the application to ensure no other processes open this file
520 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
521 * used. We might be able to avoid races of that sort by getting lli_open_sem
522 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
523 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() hook.
 *
 * Flow: allocate ll_file_data; for directories, possibly claim the
 * statahead "opendir key"; root inode is opened without MDS traffic.
 * If no intent came from lookup (NFS / patchless-kernel path), build a
 * local IT_OPEN intent from f_flags.  Under lli_och_mutex, either reuse
 * an existing per-mode MDS open handle (releasing any surplus handle
 * from the intent) or perform ll_intent_file_open() — done outside the
 * mutex to avoid a deadlock with the blocking AST — and allocate and
 * fill a new handle.  Finally handles O_LOV_DELAY_CREATE semantics for
 * regular files without striping metadata.  Error paths free the handle
 * slot, stop statahead if this open set it, drop the fd, and release
 * the intent's open-request reference.
 */
525 int ll_file_open(struct inode *inode, struct file *file)
527 struct ll_inode_info *lli = ll_i2info(inode);
528 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
529 .it_flags = file->f_flags };
530 struct obd_client_handle **och_p = NULL;
531 __u64 *och_usecount = NULL;
532 struct ll_file_data *fd;
533 int rc = 0, opendir_set = 0;
536 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
537 inode->i_generation, inode, file->f_flags);
539 it = file->private_data; /* XXX: compat macro */
540 file->private_data = NULL; /* prevent ll_local_open assertion */
542 fd = ll_file_data_get();
544 GOTO(out_openerr, rc = -ENOMEM);
547 if (S_ISDIR(inode->i_mode)) {
548 spin_lock(&lli->lli_sa_lock);
549 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
550 lli->lli_opendir_pid == 0) {
551 lli->lli_opendir_key = fd;
552 lli->lli_opendir_pid = cfs_curproc_pid();
555 spin_unlock(&lli->lli_sa_lock);
558 if (inode->i_sb->s_root == file->f_dentry) {
559 LUSTRE_FPRIVATE(file) = fd;
563 if (!it || !it->d.lustre.it_disposition) {
564 /* Convert f_flags into access mode. We cannot use file->f_mode,
565 * because everything but O_ACCMODE mask was stripped from
567 if ((oit.it_flags + 1) & O_ACCMODE)
569 if (file->f_flags & O_TRUNC)
570 oit.it_flags |= FMODE_WRITE;
572 /* kernel only call f_op->open in dentry_open. filp_open calls
573 * dentry_open after call to open_namei that checks permissions.
574 * Only nfsd_open call dentry_open directly without checking
575 * permissions and because of that this code below is safe. */
576 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
577 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
579 /* We do not want O_EXCL here, presumably we opened the file
580 * already? XXX - NFS implications? */
581 oit.it_flags &= ~O_EXCL;
583 /* bug20584, if "it_flags" contains O_CREAT, the file will be
584 * created if necessary, then "IT_CREAT" should be set to keep
585 * consistent with it */
586 if (oit.it_flags & O_CREAT)
587 oit.it_op |= IT_CREAT;
593 /* Let's see if we have file open on MDS already. */
594 if (it->it_flags & FMODE_WRITE) {
595 och_p = &lli->lli_mds_write_och;
596 och_usecount = &lli->lli_open_fd_write_count;
597 } else if (it->it_flags & FMODE_EXEC) {
598 och_p = &lli->lli_mds_exec_och;
599 och_usecount = &lli->lli_open_fd_exec_count;
601 och_p = &lli->lli_mds_read_och;
602 och_usecount = &lli->lli_open_fd_read_count;
605 mutex_lock(&lli->lli_och_mutex);
606 if (*och_p) { /* Open handle is present */
607 if (it_disposition(it, DISP_OPEN_OPEN)) {
608 /* Well, there's extra open request that we do not need,
609 let's close it somehow. This will decref request. */
610 rc = it_open_error(DISP_OPEN_OPEN, it);
612 mutex_unlock(&lli->lli_och_mutex);
613 GOTO(out_openerr, rc);
616 ll_release_openhandle(file->f_dentry, it);
620 rc = ll_local_open(file, it, fd, NULL);
623 mutex_unlock(&lli->lli_och_mutex);
624 GOTO(out_openerr, rc);
627 LASSERT(*och_usecount == 0);
628 if (!it->d.lustre.it_disposition) {
629 /* We cannot just request lock handle now, new ELC code
630 means that one of other OPEN locks for this file
631 could be cancelled, and since blocking ast handler
632 would attempt to grab och_mutex as well, that would
633 result in a deadlock */
634 mutex_unlock(&lli->lli_och_mutex);
635 it->it_create_mode |= M_CHECK_STALE;
636 rc = ll_intent_file_open(file, NULL, 0, it);
637 it->it_create_mode &= ~M_CHECK_STALE;
639 GOTO(out_openerr, rc);
643 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
645 GOTO(out_och_free, rc = -ENOMEM);
649 /* md_intent_lock() didn't get a request ref if there was an
650 * open error, so don't do cleanup on the request here
652 /* XXX (green): Should not we bail out on any error here, not
653 * just open error? */
654 rc = it_open_error(DISP_OPEN_OPEN, it);
656 GOTO(out_och_free, rc);
658 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
660 rc = ll_local_open(file, it, fd, *och_p);
662 GOTO(out_och_free, rc);
664 mutex_unlock(&lli->lli_och_mutex);
667 /* Must do this outside lli_och_mutex lock to prevent deadlock where
668 different kind of OPEN lock for this same inode gets cancelled
669 by ldlm_cancel_lru */
670 if (!S_ISREG(inode->i_mode))
671 GOTO(out_och_free, rc);
675 if (!lli->lli_has_smd) {
676 if (file->f_flags & O_LOV_DELAY_CREATE ||
677 !(file->f_mode & FMODE_WRITE)) {
678 CDEBUG(D_INODE, "object creation was delayed\n");
679 GOTO(out_och_free, rc);
682 file->f_flags &= ~O_LOV_DELAY_CREATE;
683 GOTO(out_och_free, rc);
687 if (och_p && *och_p) {
688 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
689 *och_p = NULL; /* OBD_FREE writes some magic there */
692 mutex_unlock(&lli->lli_och_mutex);
695 if (opendir_set != 0)
696 ll_stop_statahead(inode, lli->lli_opendir_key);
698 ll_file_data_put(fd);
700 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
703 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
704 ptlrpc_req_finished(it->d.lustre.it_data);
705 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks.  On LDLM_CB_BLOCKING, asynchronously
 * cancels the lease lock (a lease is not defended, just surrendered).
 * The LDLM_CB_CANCELING arm's body is elided in this extraction —
 * presumably it marks the lease broken; confirm against the full file.
 */
711 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
712 struct ldlm_lock_desc *desc, void *data, int flag)
715 struct lustre_handle lockh;
719 case LDLM_CB_BLOCKING:
720 ldlm_lock2handle(lock, &lockh);
721 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
723 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
727 case LDLM_CB_CANCELING:
735 * Acquire a lease and open the file.
/*
 * Acquire a file lease of mode @fmode (FMODE_READ or FMODE_WRITE only)
 * and open the file, returning the new obd_client_handle or ERR_PTR.
 *
 * When @file is given, the lease must match the file's open mode, must
 * not already have a lease, and — to prove same-owner to the MDT — the
 * existing openhandle is passed as op_data->op_handle (only permitted
 * when this fd is the sole opener).  The intent open carries
 * MDS_OPEN_LEASE with LDLM_FL_NO_LRU (keep the lease out of the LRU so
 * it is not cancelled behind the application's back) and LDLM_FL_EXCL
 * (prevent normal-open lock matching, which would leak the openhandle).
 * On an old server that did not grant DISP_OPEN_LEASE, or when no OPEN
 * lock came back, the openhandle is closed and the lock cancelled.
 */
737 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
740 struct lookup_intent it = { .it_op = IT_OPEN };
741 struct ll_sb_info *sbi = ll_i2sbi(inode);
742 struct md_op_data *op_data;
743 struct ptlrpc_request *req;
744 struct lustre_handle old_handle = { 0 };
745 struct obd_client_handle *och = NULL;
750 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
751 RETURN(ERR_PTR(-EINVAL));
754 struct ll_inode_info *lli = ll_i2info(inode);
755 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
756 struct obd_client_handle **och_p;
759 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
760 RETURN(ERR_PTR(-EPERM));
762 /* Get the openhandle of the file */
764 mutex_lock(&lli->lli_och_mutex);
765 if (fd->fd_lease_och != NULL) {
766 mutex_unlock(&lli->lli_och_mutex);
770 if (fd->fd_och == NULL) {
771 if (file->f_mode & FMODE_WRITE) {
772 LASSERT(lli->lli_mds_write_och != NULL);
773 och_p = &lli->lli_mds_write_och;
774 och_usecount = &lli->lli_open_fd_write_count;
776 LASSERT(lli->lli_mds_read_och != NULL);
777 och_p = &lli->lli_mds_read_och;
778 och_usecount = &lli->lli_open_fd_read_count;
780 if (*och_usecount == 1) {
787 mutex_unlock(&lli->lli_och_mutex);
788 if (rc < 0) /* more than 1 opener */
791 LASSERT(fd->fd_och != NULL);
792 old_handle = fd->fd_och->och_fh;
797 RETURN(ERR_PTR(-ENOMEM));
799 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
800 LUSTRE_OPC_ANY, NULL);
802 GOTO(out, rc = PTR_ERR(op_data));
804 /* To tell the MDT this openhandle is from the same owner */
805 op_data->op_handle = old_handle;
807 it.it_flags = fmode | MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
808 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
809 ll_md_blocking_lease_ast,
810 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
811 * it can be cancelled which may mislead applications that the lease is
813 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
814 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
815 * doesn't deal with openhandle, so normal openhandle will be leaked. */
816 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
817 ll_finish_md_op_data(op_data);
819 ptlrpc_req_finished(req);
820 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
823 GOTO(out_release_it, rc);
825 if (it_disposition(&it, DISP_LOOKUP_NEG))
826 GOTO(out_release_it, rc = -ENOENT);
828 rc = it_open_error(DISP_OPEN_OPEN, &it);
830 GOTO(out_release_it, rc);
832 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
833 ll_och_fill(sbi->ll_md_exp, &it, och);
835 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
836 GOTO(out_close, rc = -EOPNOTSUPP);
838 /* already get lease, handle lease lock */
839 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
840 if (it.d.lustre.it_lock_mode == 0 ||
841 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
842 /* open lock must return for lease */
843 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
844 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
845 it.d.lustre.it_lock_bits);
846 GOTO(out_close, rc = -EPROTO);
849 ll_intent_release(&it);
853 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och);
855 CERROR("Close openhandle returned %d\n", rc2);
857 /* cancel open lock */
858 if (it.d.lustre.it_lock_mode != 0) {
859 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
860 it.d.lustre.it_lock_mode);
861 it.d.lustre.it_lock_mode = 0;
864 ll_intent_release(&it);
869 EXPORT_SYMBOL(ll_lease_open);
872 * Release lease and close the file.
873 * It will check if the lease has ever broken.
/*
 * Release a lease taken by ll_lease_open() and close its openhandle.
 * Checks under the resource lock whether the lease lock was already
 * cancelled (lease broken); reports the result through @lease_broken
 * when the caller asked for it, cancels the lock if still held, then
 * closes the handle on the MDS.
 */
875 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
878 struct ldlm_lock *lock;
879 bool cancelled = true;
883 lock = ldlm_handle2lock(&och->och_lease_handle);
885 lock_res_and_lock(lock);
886 cancelled = ldlm_is_cancel(lock);
887 unlock_res_and_lock(lock);
891 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
892 PFID(&ll_i2info(inode)->lli_fid), cancelled);
895 ldlm_cli_cancel(&och->och_lease_handle, 0);
896 if (lease_broken != NULL)
897 *lease_broken = cancelled;
899 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och);
902 EXPORT_SYMBOL(ll_lease_close);
904 /* Fills the obdo with the attributes for the lsm */
/*
 * Fetch merged OST attributes for stripe descriptor @lsm into @obdo via
 * an async obd_getattr batched on a private ptlrpc set.  @sync requests
 * the getattr under a server-side lock (OBD_FL_SRVLOCK).  On success
 * the returned o_valid is masked down to the size/blocks/time/version
 * bits that are actually meaningful from the OSTs.
 */
905 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
906 struct obd_capa *capa, struct obdo *obdo,
907 __u64 ioepoch, int sync)
909 struct ptlrpc_request_set *set;
910 struct obd_info oinfo = { { { 0 } } };
915 LASSERT(lsm != NULL);
919 oinfo.oi_oa->o_oi = lsm->lsm_oi;
920 oinfo.oi_oa->o_mode = S_IFREG;
921 oinfo.oi_oa->o_ioepoch = ioepoch;
922 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
923 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
924 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
925 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
926 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
927 OBD_MD_FLDATAVERSION;
928 oinfo.oi_capa = capa;
930 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
931 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
934 set = ptlrpc_prep_set();
936 CERROR("can't allocate ptlrpc set\n");
939 rc = obd_getattr_async(exp, &oinfo, set);
941 rc = ptlrpc_set_wait(set);
942 ptlrpc_set_destroy(set);
945 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
946 OBD_MD_FLATIME | OBD_MD_FLMTIME |
947 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
948 OBD_MD_FLDATAVERSION);
953 * Performs the getattr on the inode and updates its fields.
954 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Getattr on @inode's stripes and refresh the inode fields from the
 * returned obdo (size, blocks, times — via obdo_refresh_inode()).
 * @sync forwards to ll_lsm_getattr() to take the server-side lock.
 * Releases the MDS capability and the lsm reference before returning.
 */
956 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
957 __u64 ioepoch, int sync)
959 struct obd_capa *capa = ll_mdscapa_get(inode);
960 struct lov_stripe_md *lsm;
964 lsm = ccc_inode_lsm_get(inode);
965 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
966 capa, obdo, ioepoch, sync);
969 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
971 obdo_refresh_inode(inode, obdo, obdo->o_valid);
972 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
973 " blksize %lu\n", POSTID(oi), i_size_read(inode),
974 (unsigned long long)inode->i_blocks,
975 (unsigned long)ll_inode_blksize(inode));
977 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * obtained from the OSTs via the cl_object layer, taking the newest of
 * each timestamp, and update i_size/i_blocks under the inode size lock.
 */
981 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
983 struct ll_inode_info *lli = ll_i2info(inode);
984 struct cl_object *obj = lli->lli_clob;
985 struct cl_attr *attr = ccc_env_thread_attr(env);
991 ll_inode_size_lock(inode);
992 /* merge timestamps the most recently obtained from mds with
993 timestamps obtained from osts */
994 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
995 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
996 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
997 inode_init_lvb(inode, &lvb);
999 cl_object_attr_lock(obj);
1000 rc = cl_object_attr_get(env, obj, attr);
1001 cl_object_attr_unlock(obj);
1004 if (lvb.lvb_atime < attr->cat_atime)
1005 lvb.lvb_atime = attr->cat_atime;
1006 if (lvb.lvb_ctime < attr->cat_ctime)
1007 lvb.lvb_ctime = attr->cat_ctime;
1008 if (lvb.lvb_mtime < attr->cat_mtime)
1009 lvb.lvb_mtime = attr->cat_mtime;
1011 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1012 PFID(&lli->lli_fid), attr->cat_size);
1013 cl_isize_write_nolock(inode, attr->cat_size);
1015 inode->i_blocks = attr->cat_blocks;
1017 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1018 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1019 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1021 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctl: getattr on @lsm's stripes and copy
 * size/blocks/times from the returned obdo into the caller's stat
 * structure @st.
 */
1026 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1029 struct obdo obdo = { 0 };
1032 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1034 st->st_size = obdo.o_size;
1035 st->st_blocks = obdo.o_blocks;
1036 st->st_mtime = obdo.o_mtime;
1037 st->st_atime = obdo.o_atime;
1038 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for read or write on @file: propagate
 * O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT into the rw parameters and choose
 * the lock requirement — never lock for nolock files (also requests
 * server-side locking), mandatory lock for append, otherwise "maybe".
 */
1043 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1045 struct inode *inode = file->f_dentry->d_inode;
1047 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1049 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1050 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1051 file->f_flags & O_DIRECT ||
1054 io->ci_obj = ll_i2info(inode)->lli_clob;
1055 io->ci_lockreq = CILR_MAYBE;
1056 if (ll_file_nolock(file)) {
1057 io->ci_lockreq = CILR_NEVER;
1058 io->ci_no_srvlock = 1;
1059 } else if (file->f_flags & O_APPEND) {
1060 io->ci_lockreq = CILR_MANDATORY;
/*
 * Generic read/write engine shared by all entry points (normal iov,
 * sendfile, splice).  Sets up a cl_io for @iot at *@ppos/@count, wires
 * the per-subtype arguments into the vvp/ccc io state, serializes
 * writes against truncate via lli_write_mutex (reads take
 * lli_trunc_sem), runs cl_io_loop() and propagates bytes moved back to
 * @ppos.  A zero/-ENODATA result with ci_need_restart logs a restart
 * (short read/write is returned rather than restarting once any bytes
 * moved).  Updates read/write byte stats and fd_write_failed.
 */
1065 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1066 struct file *file, enum cl_io_type iot,
1067 loff_t *ppos, size_t count)
1069 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1070 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1076 io = ccc_env_thread_io(env);
1077 ll_io_init(io, file, iot == CIT_WRITE);
1079 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1080 struct vvp_io *vio = vvp_env_io(env);
1081 struct ccc_io *cio = ccc_env_io(env);
1082 int write_mutex_locked = 0;
1084 cio->cui_fd = LUSTRE_FPRIVATE(file);
1085 vio->cui_io_subtype = args->via_io_subtype;
1087 switch (vio->cui_io_subtype) {
1089 cio->cui_iov = args->u.normal.via_iov;
1090 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1091 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1092 #ifndef HAVE_FILE_WRITEV
1093 cio->cui_iocb = args->u.normal.via_iocb;
1095 if ((iot == CIT_WRITE) &&
1096 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1097 if (mutex_lock_interruptible(&lli->
1099 GOTO(out, result = -ERESTARTSYS);
1100 write_mutex_locked = 1;
1101 } else if (iot == CIT_READ) {
1102 down_read(&lli->lli_trunc_sem);
1106 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1107 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1110 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1111 vio->u.splice.cui_flags = args->u.splice.via_flags;
1114 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1117 result = cl_io_loop(env, io);
1118 if (write_mutex_locked)
1119 mutex_unlock(&lli->lli_write_mutex);
1120 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1121 up_read(&lli->lli_trunc_sem);
1123 /* cl_io_rw_init() handled IO */
1124 result = io->ci_result;
1127 if (io->ci_nob > 0) {
1128 result = io->ci_nob;
1129 *ppos = io->u.ci_wr.wr.crw_pos;
1133 cl_io_fini(env, io);
1134 /* If any bit been read/written (result != 0), we just return
1135 * short read/write instead of restart io. */
1136 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1137 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1138 iot == CIT_READ ? "read" : "write",
1139 file->f_dentry->d_name.name, *ppos, count);
1140 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1144 if (iot == CIT_READ) {
1146 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1147 LPROC_LL_READ_BYTES, result);
1148 } else if (iot == CIT_WRITE) {
1150 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1151 LPROC_LL_WRITE_BYTES, result);
1152 fd->fd_write_failed = false;
1153 } else if (result != -ERESTARTSYS) {
1154 fd->fd_write_failed = true;
1163 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count (mirrors the
 * kernel's __generic_file_aio_write_nolock).  Rejects negative segment
 * lengths and cumulative wraparound; on an access_ok() failure it
 * truncates *nr_segs at the bad segment and backs the count off by that
 * segment's length (partial transfer semantics).
 */
1165 static int ll_file_get_iov_count(const struct iovec *iov,
1166 unsigned long *nr_segs, size_t *count)
1171 for (seg = 0; seg < *nr_segs; seg++) {
1172 const struct iovec *iv = &iov[seg];
1175 * If any segment has a negative length, or the cumulative
1176 * length ever wraps negative then return -EINVAL.
1179 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1181 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1186 cnt -= iv->iov_len; /* This segment is no good */
1193 #ifdef HAVE_FILE_READV
/*
 * Vectored read entry point (HAVE_FILE_READV kernels): validate the
 * iovec, obtain a cl environment, stash the iovec in the per-thread
 * vvp args and run the generic CIT_READ path.
 */
1194 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1195 unsigned long nr_segs, loff_t *ppos)
1198 struct vvp_io_args *args;
1204 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1208 env = cl_env_get(&refcheck);
1210 RETURN(PTR_ERR(env));
1212 args = vvp_env_args(env, IO_NORMAL);
1213 args->u.normal.via_iov = (struct iovec *)iov;
1214 args->u.normal.via_nrsegs = nr_segs;
1216 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1217 cl_env_put(env, &refcheck);
/*
 * Plain read(2) entry point (readv-based variant): wrap the user buffer
 * in a single per-thread iovec and forward to ll_file_readv().
 */
1221 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1225 struct iovec *local_iov;
1230 env = cl_env_get(&refcheck);
1232 RETURN(PTR_ERR(env));
1234 local_iov = &vvp_env_info(env)->vti_local_iov;
1235 local_iov->iov_base = (void __user *)buf;
1236 local_iov->iov_len = count;
1237 result = ll_file_readv(file, local_iov, 1, ppos);
1238 cl_env_put(env, &refcheck);
/*
 * aio_read entry point (non-readv kernels): validate the iovec, record
 * iovec and iocb in the per-thread vvp args and run the generic
 * CIT_READ path against the iocb's position.
 */
1243 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1244 unsigned long nr_segs, loff_t pos)
1247 struct vvp_io_args *args;
1253 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1257 env = cl_env_get(&refcheck);
1259 RETURN(PTR_ERR(env));
1261 args = vvp_env_args(env, IO_NORMAL);
1262 args->u.normal.via_iov = (struct iovec *)iov;
1263 args->u.normal.via_nrsegs = nr_segs;
1264 args->u.normal.via_iocb = iocb;
1266 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1267 &iocb->ki_pos, count);
1268 cl_env_put(env, &refcheck);
1272 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1276 struct iovec *local_iov;
1277 struct kiocb *kiocb;
1282 env = cl_env_get(&refcheck);
1284 RETURN(PTR_ERR(env));
1286 local_iov = &vvp_env_info(env)->vti_local_iov;
1287 kiocb = &vvp_env_info(env)->vti_kiocb;
1288 local_iov->iov_base = (void __user *)buf;
1289 local_iov->iov_len = count;
1290 init_sync_kiocb(kiocb, file);
1291 kiocb->ki_pos = *ppos;
1292 kiocb->ki_left = count;
1294 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1295 *ppos = kiocb->ki_pos;
1297 cl_env_put(env, &refcheck);
/*
 * Write entry points, mirroring the read variants above: writev()-style
 * kernels vs. aio_write()-style kernels, both funnelling into
 * ll_file_io_generic() with CIT_WRITE.
 */
1303  * Write to a file (through the page cache).
1305 #ifdef HAVE_FILE_WRITEV
/* Vector write for pre-AIO kernels. */
1306 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1307 unsigned long nr_segs, loff_t *ppos)
1310 struct vvp_io_args *args;
1316 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1320 env = cl_env_get(&refcheck);
1322 RETURN(PTR_ERR(env));
1324 args = vvp_env_args(env, IO_NORMAL);
1325 args->u.normal.via_iov = (struct iovec *)iov;
1326 args->u.normal.via_nrsegs = nr_segs;
1328 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1329 cl_env_put(env, &refcheck);
/* Scalar write: single-segment iovec wrapper around ll_file_writev(). */
1333 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1337 struct iovec *local_iov;
1342 env = cl_env_get(&refcheck);
1344 RETURN(PTR_ERR(env));
1346 local_iov = &vvp_env_info(env)->vti_local_iov;
1347 local_iov->iov_base = (void __user *)buf;
1348 local_iov->iov_len = count;
1350 result = ll_file_writev(file, local_iov, 1, ppos);
1351 cl_env_put(env, &refcheck);
1355 #else /* AIO stuff */
/* AIO write: position carried in iocb->ki_pos. */
1356 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1357 unsigned long nr_segs, loff_t pos)
1360 struct vvp_io_args *args;
1366 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1370 env = cl_env_get(&refcheck);
1372 RETURN(PTR_ERR(env));
1374 args = vvp_env_args(env, IO_NORMAL);
1375 args->u.normal.via_iov = (struct iovec *)iov;
1376 args->u.normal.via_nrsegs = nr_segs;
1377 args->u.normal.via_iocb = iocb;
1379 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1380 &iocb->ki_pos, count);
1381 cl_env_put(env, &refcheck);
/* Scalar write for AIO kernels: synchronous kiocb wrapper. */
1385 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1389 struct iovec *local_iov;
1390 struct kiocb *kiocb;
1395 env = cl_env_get(&refcheck);
1397 RETURN(PTR_ERR(env));
1399 local_iov = &vvp_env_info(env)->vti_local_iov;
1400 kiocb = &vvp_env_info(env)->vti_kiocb;
1401 local_iov->iov_base = (void __user *)buf;
1402 local_iov->iov_len = count;
1403 init_sync_kiocb(kiocb, file);
1404 kiocb->ki_pos = *ppos;
1405 kiocb->ki_left = count;
1407 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* propagate the advanced position back to the caller's *ppos */
1408 *ppos = kiocb->ki_pos;
1410 cl_env_put(env, &refcheck);
/*
 * splice_read() handler: stream pagecache content into a pipe.  Uses the
 * IO_SPLICE variant of vvp_io_args and the same generic CIT_READ path.
 */
1416  * Send file content (through pagecache) somewhere with helper
1418 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1419 struct pipe_inode_info *pipe, size_t count,
1423 struct vvp_io_args *args;
1428 env = cl_env_get(&refcheck);
1430 RETURN(PTR_ERR(env));
1432 args = vvp_env_args(env, IO_SPLICE);
1433 args->u.splice.via_pipe = pipe;
1434 args->u.splice.via_flags = flags;
1436 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1437 cl_env_put(env, &refcheck);
/*
 * Ask the OST(s) to recreate a file's objects (recovery aid).  Clones the
 * inode's stripe metadata, marks the obdo with OBD_FL_RECREATE_OBJS and
 * issues obd_create() under the inode size lock.  Returns -ENOENT when the
 * file has no objects.  NOTE(review): allocation of 'oa' and the error
 * paths are not visible in this chunk.
 */
1441 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1444 struct obd_export *exp = ll_i2dtexp(inode);
1445 struct obd_trans_info oti = { 0 };
1446 struct obdo *oa = NULL;
1449 struct lov_stripe_md *lsm = NULL, *lsm2;
1456 lsm = ccc_inode_lsm_get(inode);
1457 if (!lsm_has_objects(lsm))
1458 GOTO(out, rc = -ENOENT);
1460 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1461 (lsm->lsm_stripe_count));
1463 OBD_ALLOC_LARGE(lsm2, lsm_size);
1465 GOTO(out, rc = -ENOMEM);
/* o_nlink is overloaded here to carry the target OST index */
1468 oa->o_nlink = ost_idx;
1469 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1470 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1471 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1472 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1473 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1474 memcpy(lsm2, lsm, lsm_size);
1475 ll_inode_size_lock(inode);
1476 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1477 ll_inode_size_unlock(inode);
1479 OBD_FREE_LARGE(lsm2, lsm_size);
1482 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy the request from userspace, build an
 * MDT0 ost_id from the given object id, and delegate to ll_lov_recreate().
 * Root-only (CFS_CAP_SYS_ADMIN).
 */
1487 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1489 struct ll_recreate_obj ucreat;
1493 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1496 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1500 ostid_set_seq_mdt0(&oi);
1501 ostid_set_id(&oi, ucreat.lrc_id);
1502 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: derive ost_id and OST index from a user
 * supplied FID and delegate to ll_lov_recreate().  Root-only.
 */
1505 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1512 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1515 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1518 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the FID sequence */
1519 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1520 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply striping (LOV EA) to a file via an IT_OPEN intent carrying the
 * lov_user_md.  Fails early if the file already has a layout.  The open
 * handle obtained by the intent is released immediately; only the layout
 * side effect is wanted.
 */
1523 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1524 int flags, struct lov_user_md *lum, int lum_size)
1526 struct lov_stripe_md *lsm = NULL;
1527 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1531 lsm = ccc_inode_lsm_get(inode);
/* layout already set: setstripe is create-time only */
1533 ccc_inode_lsm_put(inode, lsm);
1534 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1539 ll_inode_size_lock(inode);
1540 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1543 rc = oit.d.lustre.it_status;
1545 GOTO(out_req_free, rc);
1547 ll_release_openhandle(file->f_dentry, &oit);
1550 ll_inode_size_unlock(inode);
1551 ll_intent_release(&oit);
1552 ccc_inode_lsm_put(inode, lsm);
1555 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping) for 'filename' relative to 'inode' via an
 * MDS getattr-by-name.  On success *lmmp/*lmm_size point into the reply
 * buffer, which the caller must keep alive via *request until done.
 * Returns -ENODATA when no layout exists, -EPROTO on unknown LOV magic.
 */
1559 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1560 struct lov_mds_md **lmmp, int *lmm_size,
1561 struct ptlrpc_request **request)
1563 struct ll_sb_info *sbi = ll_i2sbi(inode);
1564 struct mdt_body *body;
1565 struct lov_mds_md *lmm = NULL;
1566 struct ptlrpc_request *req = NULL;
1567 struct md_op_data *op_data;
1570 rc = ll_get_max_mdsize(sbi, &lmmsize);
1574 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1575 strlen(filename), lmmsize,
1576 LUSTRE_OPC_ANY, NULL);
1577 if (IS_ERR(op_data))
1578 RETURN(PTR_ERR(op_data));
1580 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1581 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1582 ll_finish_md_op_data(op_data);
1584 CDEBUG(D_INFO, "md_getattr_name failed "
1585 "on %s: rc %d\n", filename, rc);
1589 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1590 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1592 lmmsize = body->eadatasize;
1594 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1596 GOTO(out, rc = -ENODATA);
1599 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1600 LASSERT(lmm != NULL);
1602 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1603 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1604 GOTO(out, rc = -EPROTO);
1608  * This is coming from the MDS, so is probably in
1609  * little endian. We convert it to host endian before
1610  * passing it to userspace.
/* only swab on big-endian hosts where LOV_MAGIC differs from its LE form */
1612 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1615 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1616 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1619 /* if function called for directory - we should
1620  * avoid swab not existent lsm objects */
1621 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1622 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1623 if (S_ISREG(body->mode))
1624 lustre_swab_lov_user_md_objects(
1625 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1627 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1628 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1629 if (S_ISREG(body->mode))
1630 lustre_swab_lov_user_md_objects(
1631 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1638 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only setstripe with explicit object
 * assignments (MDS_OPEN_HAS_OBJS).  Copies the lov_user_md (plus one
 * ost_data slot) from userspace and applies it via
 * ll_lov_setstripe_ea_info().
 */
1643 static int ll_lov_setea(struct inode *inode, struct file *file,
1646 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1647 struct lov_user_md *lump;
1648 int lum_size = sizeof(struct lov_user_md) +
1649 sizeof(struct lov_user_ost_data);
1653 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1656 OBD_ALLOC_LARGE(lump, lum_size);
1660 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1661 OBD_FREE_LARGE(lump, lum_size);
1665 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1667 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a v1 lov_user_md first (smaller),
 * re-read as v3 if the magic says so, apply the layout, then refresh the
 * layout generation and echo the resulting stripe info back to userspace.
 */
1671 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1674 struct lov_user_md_v3 lumv3;
1675 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1676 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1677 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1679 int flags = FMODE_WRITE;
1682 /* first try with v1 which is smaller than v3 */
1683 lum_size = sizeof(struct lov_user_md_v1);
1684 if (copy_from_user(lumv1, lumv1p, lum_size))
1687 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1688 lum_size = sizeof(struct lov_user_md_v3);
1689 if (copy_from_user(&lumv3, lumv3p, lum_size))
1693 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1695 struct lov_stripe_md *lsm;
/* tell userspace the kernel consumed the request (0 stripes pending) */
1698 put_user(0, &lumv1p->lmm_stripe_count);
1700 ll_layout_refresh(inode, &gen);
1701 lsm = ccc_inode_lsm_get(inode);
1702 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1703 0, lsm, (void *)arg);
1704 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe md to the LOV
 * layer, which copies the layout out to the user buffer in 'arg'.
 */
1709 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1711 struct lov_stripe_md *lsm;
1715 lsm = ccc_inode_lsm_get(inode);
1717 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1719 ccc_inode_lsm_put(inode, lsm);
/*
 * Take a group (GID-based) extent lock on the whole file.  The lock is
 * recorded in the per-open file data (fd_grouplock) under lli_lock; a
 * second request on the same fd fails, and a race with another thread is
 * resolved by dropping the just-acquired lock.
 */
1723 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1725 struct ll_inode_info *lli = ll_i2info(inode);
1726 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1727 struct ccc_grouplock grouplock;
1731 if (ll_file_nolock(file))
1732 RETURN(-EOPNOTSUPP);
1734 spin_lock(&lli->lli_lock);
1735 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1736 CWARN("group lock already existed with gid %lu\n",
1737 fd->fd_grouplock.cg_gid);
1738 spin_unlock(&lli->lli_lock);
1741 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1742 spin_unlock(&lli->lli_lock);
/* enqueue outside the spinlock; may block unless O_NONBLOCK */
1744 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1745 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1749 spin_lock(&lli->lli_lock);
1750 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1751 spin_unlock(&lli->lli_lock);
1752 CERROR("another thread just won the race\n");
1753 cl_put_grouplock(&grouplock);
1757 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1758 fd->fd_grouplock = grouplock;
1759 spin_unlock(&lli->lli_lock);
1761 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock previously taken on this fd.  Validates that a
 * group lock is held and that its GID matches 'arg' before clearing the
 * per-fd record (under lli_lock) and dropping the cl-layer lock.
 */
1765 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1767 struct ll_inode_info *lli = ll_i2info(inode);
1768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1769 struct ccc_grouplock grouplock;
1772 spin_lock(&lli->lli_lock);
1773 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1774 spin_unlock(&lli->lli_lock);
1775 CWARN("no group lock held\n");
1778 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1780 if (fd->fd_grouplock.cg_gid != arg) {
1781 CWARN("group lock %lu doesn't match current id %lu\n",
1782 arg, fd->fd_grouplock.cg_gid);
1783 spin_unlock(&lli->lli_lock);
/* detach the record under the spinlock, release outside it */
1787 grouplock = fd->fd_grouplock;
1788 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1789 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1790 spin_unlock(&lli->lli_lock);
1792 cl_put_grouplock(&grouplock);
1793 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle carried by a lookup intent (used when an
 * intent-open was issued only for a side effect, e.g. setstripe).  No-op
 * for the root dentry or when the intent holds no open disposition.
 */
1798  * Close inode open handle
1800  * \param dentry [in] dentry which contains the inode
1801  * \param it [in,out] intent which contains open info and result
1804  * \retval <0 failure
1806 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1808 struct inode *inode = dentry->d_inode;
1809 struct obd_client_handle *och;
1815 /* Root ? Do nothing. */
1816 if (dentry->d_inode->i_sb->s_root == dentry)
1819 /* No open handle to close? Move away */
1820 if (!it_disposition(it, DISP_OPEN_OPEN))
1823 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1825 OBD_ALLOC(och, sizeof(*och));
1827 GOTO(out, rc = -ENOMEM);
1829 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1831 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1834 /* this one is in place of ll_file_open */
1835 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1836 ptlrpc_req_finished(it->d.lustre.it_data);
1837 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Perform the FIEMAP extent query for an inode: validate flags, optionally
 * flush dirty pages (FIEMAP_FLAG_SYNC), then ask the OSTs via
 * obd_get_info(KEY_FIEMAP).  Striped files (>1 stripe) require the caller
 * to understand FIEMAP_FLAG_DEVICE_ORDER; a zero-size file short-circuits
 * with zero mapped extents.
 */
1843  * Get size for inode for which FIEMAP mapping is requested.
1844  * Make the FIEMAP get_info call and returns the result.
1846 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1849 struct obd_export *exp = ll_i2dtexp(inode);
1850 struct lov_stripe_md *lsm = NULL;
1851 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1852 int vallen = num_bytes;
1856 /* Checks for fiemap flags */
1857 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do not support */
1858 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1862 /* Check for FIEMAP_FLAG_SYNC */
1863 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1864 rc = filemap_fdatawrite(inode->i_mapping);
1869 lsm = ccc_inode_lsm_get(inode);
1873 /* If the stripe_count > 1 and the application does not understand
1874  * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1876 if (lsm->lsm_stripe_count > 1 &&
1877 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1878 GOTO(out, rc = -EOPNOTSUPP);
1880 fm_key.oa.o_oi = lsm->lsm_oi;
1881 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1883 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1884 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1885 /* If filesize is 0, then there would be no objects for mapping */
1886 if (fm_key.oa.o_size == 0) {
1887 fiemap->fm_mapped_extents = 0;
1891 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1893 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1896 CERROR("obd_get_info failed: rc = %d\n", rc);
1899 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.  Reads a
 * getinfo_fid2path header from userspace to learn the path buffer length,
 * allocates an output buffer of that size, and copies the whole result
 * back.  Allowed for root or when the LL_SBI_USER_FID2PATH mount flag is
 * set.
 */
1903 int ll_fid2path(struct inode *inode, void *arg)
1905 struct obd_export *exp = ll_i2mdexp(inode);
1906 struct getinfo_fid2path *gfout, *gfin;
1910 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1911 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1914 /* Need to get the buflen */
1915 OBD_ALLOC_PTR(gfin);
1918 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1923 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1924 OBD_ALLOC(gfout, outsize);
1925 if (gfout == NULL) {
1929 memcpy(gfout, gfin, sizeof(*gfout));
1932 /* Call mdc_iocontrol */
1933 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1937 if (copy_to_user(arg, gfout, outsize))
1941 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user-supplied fm_extent_count, copy the request (and, when present, the
 * first extent used for continuation) in, run ll_do_fiemap(), and copy the
 * header plus mapped extents back out.
 */
1945 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1947 struct ll_user_fiemap *fiemap_s;
1948 size_t num_bytes, ret_bytes;
1949 unsigned int extent_count;
1952 /* Get the extent count so we can calculate the size of
1953  * required fiemap buffer */
1954 if (get_user(extent_count,
1955 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1957 num_bytes = sizeof(*fiemap_s) + (extent_count *
1958 sizeof(struct ll_fiemap_extent));
1960 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1961 if (fiemap_s == NULL)
1964 /* get the fiemap value */
1965 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1967 GOTO(error, rc = -EFAULT);
1969 /* If fm_extent_count is non-zero, read the first extent since
1970  * it is used to calculate end_offset and device from previous
1973 if (copy_from_user(&fiemap_s->fm_extents[0],
1974 (char __user *)arg + sizeof(*fiemap_s),
1975 sizeof(struct ll_fiemap_extent)))
1976 GOTO(error, rc = -EFAULT);
1979 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1983 ret_bytes = sizeof(struct ll_user_fiemap);
1985 if (extent_count != 0)
1986 ret_bytes += (fiemap_s->fm_mapped_extents *
1987 sizeof(struct ll_fiemap_extent));
1989 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1993 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Read the file's data version (computed from stripe object versions on
 * the OSTs, under server-side locking).  A file with no objects reports
 * version 0; an OST reply lacking OBD_MD_FLDATAVERSION leaves the result
 * unset (visible error path not shown in this chunk).
 */
1998  * Read the data_version for inode.
2000  * This value is computed using stripe object version on OST.
2001  * Version is computed using server side locking.
2003  * @param extent_lock  Take extent lock. Not needed if a process is already
2004  *                     holding the OST object group locks.
2006 int ll_data_version(struct inode *inode, __u64 *data_version,
2009 struct lov_stripe_md *lsm = NULL;
2010 struct ll_sb_info *sbi = ll_i2sbi(inode);
2011 struct obdo *obdo = NULL;
2015 /* If no stripe, we consider version is 0. */
2016 lsm = ccc_inode_lsm_get(inode);
2017 if (!lsm_has_objects(lsm)) {
2019 CDEBUG(D_INODE, "No object for inode\n");
2023 OBD_ALLOC_PTR(obdo);
2025 GOTO(out, rc = -ENOMEM);
2027 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
2029 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2032 *data_version = obdo->o_data_version;
2038 ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): the two inodes involved, saved
 * timestamps (ia1/ia2) for optional restore, and the data-version check
 * flags/values requested by the caller.
 */
2042 struct ll_swap_stack {
2043 struct iattr ia1, ia2;
2045 struct inode *inode1, *inode2;
2046 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the layouts
 * of two regular files on the same filesystem.  Steps: permission and
 * same-sb checks; order the pair by FID to avoid lock inversion; optional
 * group locks (gid != 0) to flush dirty cache; optional data-version
 * checks (-EAGAIN if changed); MDT swap via obd_iocontrol; optional
 * mtime/atime restore per SWAP_LAYOUTS_KEEP_* flags.
 */
2049 static int ll_swap_layouts(struct file *file1, struct file *file2,
2050 struct lustre_swap_layouts *lsl)
2052 struct mdc_swap_layouts msl;
2053 struct md_op_data *op_data;
2056 struct ll_swap_stack *llss = NULL;
2059 OBD_ALLOC_PTR(llss);
2063 llss->inode1 = file1->f_dentry->d_inode;
2064 llss->inode2 = file2->f_dentry->d_inode;
2066 if (!S_ISREG(llss->inode2->i_mode))
2067 GOTO(free, rc = -EINVAL);
2069 if (inode_permission(llss->inode1, MAY_WRITE) ||
2070 inode_permission(llss->inode2, MAY_WRITE))
2071 GOTO(free, rc = -EPERM);
2073 if (llss->inode2->i_sb != llss->inode1->i_sb)
2074 GOTO(free, rc = -EXDEV);
2076 /* we use 2 bool because it is easier to swap than 2 bits */
2077 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2078 llss->check_dv1 = true;
2080 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2081 llss->check_dv2 = true;
2083 /* we cannot use lsl->sl_dvX directly because we may swap them */
2084 llss->dv1 = lsl->sl_dv1;
2085 llss->dv2 = lsl->sl_dv2;
2087 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2088 if (rc == 0) /* same file, done! */
/* canonical FID ordering: swap roles so locking order is deterministic */
2091 if (rc < 0) { /* sequentialize it */
2092 swap(llss->inode1, llss->inode2);
2094 swap(llss->dv1, llss->dv2);
2095 swap(llss->check_dv1, llss->check_dv2);
2099 if (gid != 0) { /* application asks to flush dirty cache */
2100 rc = ll_get_grouplock(llss->inode1, file1, gid);
2104 rc = ll_get_grouplock(llss->inode2, file2, gid);
2106 ll_put_grouplock(llss->inode1, file1, gid);
2111 /* to be able to restore mtime and atime after swap
2112  * we need to first save them */
2114 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2115 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2116 llss->ia1.ia_atime = llss->inode1->i_atime;
2117 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2118 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2119 llss->ia2.ia_atime = llss->inode2->i_atime;
2120 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2123 /* ultimate check, before swaping the layouts we check if
2124  * dataversion has changed (if requested) */
2125 if (llss->check_dv1) {
2126 rc = ll_data_version(llss->inode1, &dv, 0);
2129 if (dv != llss->dv1)
2130 GOTO(putgl, rc = -EAGAIN);
2133 if (llss->check_dv2) {
2134 rc = ll_data_version(llss->inode2, &dv, 0);
2137 if (dv != llss->dv2)
2138 GOTO(putgl, rc = -EAGAIN);
2141 /* struct md_op_data is used to send the swap args to the mdt
2142  * only flags is missing, so we use struct mdc_swap_layouts
2143  * through the md_op_data->op_data */
2144 /* flags from user space have to be converted before they are send to
2145  * server, no flag is sent today, they are only used on the client */
2148 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2149 0, LUSTRE_OPC_ANY, &msl);
2150 if (IS_ERR(op_data))
2151 GOTO(free, rc = PTR_ERR(op_data));
2153 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2154 sizeof(*op_data), op_data, NULL);
2155 ll_finish_md_op_data(op_data);
2159 ll_put_grouplock(llss->inode2, file2, gid);
2160 ll_put_grouplock(llss->inode1, file1, gid);
2163 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2167 /* clear useless flags */
2168 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2169 llss->ia1.ia_valid &= ~ATTR_MTIME;
2170 llss->ia2.ia_valid &= ~ATTR_MTIME;
2173 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2174 llss->ia1.ia_valid &= ~ATTR_ATIME;
2175 llss->ia2.ia_valid &= ~ATTR_ATIME;
2178 /* update time if requested */
/* note: ia2's saved times go to inode1 (the layouts were swapped) */
2180 if (llss->ia2.ia_valid != 0) {
2181 mutex_lock(&llss->inode1->i_mutex);
2182 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2183 mutex_unlock(&llss->inode1->i_mutex);
2186 if (llss->ia1.ia_valid != 0) {
2189 mutex_lock(&llss->inode2->i_mutex);
2190 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2191 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main ioctl dispatcher for regular files: per-fd flag manipulation,
 * striping get/set, layout swap, FIEMAP, group locks, FID/path queries,
 * data version, HSM state/action, and lease get/set.  Unrecognized
 * commands fall through to the registered ioctl handlers and finally to
 * the data export.
 */
2203 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2205 struct inode *inode = file->f_dentry->d_inode;
2206 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2210 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2211 inode->i_generation, inode, cmd);
2212 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2214 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2215 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2219 case LL_IOC_GETFLAGS:
2220 /* Get the current value of the file flags */
2221 return put_user(fd->fd_flags, (int *)arg);
2222 case LL_IOC_SETFLAGS:
2223 case LL_IOC_CLRFLAGS:
2224 /* Set or clear specific file flags */
2225 /* XXX This probably needs checks to ensure the flags are
2226  * not abused, and to handle any flag side effects.
2228 if (get_user(flags, (int *) arg))
2231 if (cmd == LL_IOC_SETFLAGS) {
2232 if ((flags & LL_FILE_IGNORE_LOCK) &&
2233 !(file->f_flags & O_DIRECT)) {
2234 CERROR("%s: unable to disable locking on "
2235 "non-O_DIRECT file\n", current->comm);
2239 fd->fd_flags |= flags;
2241 fd->fd_flags &= ~flags;
2244 case LL_IOC_LOV_SETSTRIPE:
2245 RETURN(ll_lov_setstripe(inode, file, arg));
2246 case LL_IOC_LOV_SETEA:
2247 RETURN(ll_lov_setea(inode, file, arg));
2248 case LL_IOC_LOV_SWAP_LAYOUTS: {
2250 struct lustre_swap_layouts lsl;
2252 if (copy_from_user(&lsl, (char *)arg,
2253 sizeof(struct lustre_swap_layouts)))
/* both files must be writable; this one and the fget()'d peer below */
2256 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2259 file2 = fget(lsl.sl_fd);
2264 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2265 rc = ll_swap_layouts(file, file2, &lsl);
2269 case LL_IOC_LOV_GETSTRIPE:
2270 RETURN(ll_lov_getstripe(inode, arg));
2271 case LL_IOC_RECREATE_OBJ:
2272 RETURN(ll_lov_recreate_obj(inode, arg));
2273 case LL_IOC_RECREATE_FID:
2274 RETURN(ll_lov_recreate_fid(inode, arg));
2275 case FSFILT_IOC_FIEMAP:
2276 RETURN(ll_ioctl_fiemap(inode, arg));
2277 case FSFILT_IOC_GETFLAGS:
2278 case FSFILT_IOC_SETFLAGS:
2279 RETURN(ll_iocontrol(inode, file, cmd, arg));
2280 case FSFILT_IOC_GETVERSION_OLD:
2281 case FSFILT_IOC_GETVERSION:
2282 RETURN(put_user(inode->i_generation, (int *)arg));
2283 case LL_IOC_GROUP_LOCK:
2284 RETURN(ll_get_grouplock(inode, file, arg));
2285 case LL_IOC_GROUP_UNLOCK:
2286 RETURN(ll_put_grouplock(inode, file, arg));
2287 case IOC_OBD_STATFS:
2288 RETURN(ll_obd_statfs(inode, (void *)arg));
2290 /* We need to special case any other ioctls we want to handle,
2291  * to send them to the MDS/OST as appropriate and to properly
2292  * network encode the arg field.
2293 case FSFILT_IOC_SETVERSION_OLD:
2294 case FSFILT_IOC_SETVERSION:
2296 case LL_IOC_FLUSHCTX:
2297 RETURN(ll_flush_ctx(inode));
2298 case LL_IOC_PATH2FID: {
2299 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2300 sizeof(struct lu_fid)))
2305 case OBD_IOC_FID2PATH:
2306 RETURN(ll_fid2path(inode, (void *)arg));
2307 case LL_IOC_DATA_VERSION: {
2308 struct ioc_data_version idv;
2311 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2314 rc = ll_data_version(inode, &idv.idv_version,
2315 !(idv.idv_flags & LL_DV_NOFLUSH));
2317 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2323 case LL_IOC_GET_MDTIDX: {
2326 mdtidx = ll_get_mdt_idx(inode);
2330 if (put_user((int)mdtidx, (int*)arg))
2335 case OBD_IOC_GETDTNAME:
2336 case OBD_IOC_GETMDNAME:
2337 RETURN(ll_get_obd_name(inode, cmd, arg));
2338 case LL_IOC_HSM_STATE_GET: {
2339 struct md_op_data *op_data;
2340 struct hsm_user_state *hus;
2347 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2348 LUSTRE_OPC_ANY, hus);
2349 if (IS_ERR(op_data)) {
2351 RETURN(PTR_ERR(op_data));
2354 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2357 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2360 ll_finish_md_op_data(op_data);
2364 case LL_IOC_HSM_STATE_SET: {
2365 struct md_op_data *op_data;
2366 struct hsm_state_set *hss;
2372 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2377 /* Non-root users are forbidden to set or clear flags which are
2378  * NOT defined in HSM_USER_MASK. */
2379 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2380 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2385 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2386 LUSTRE_OPC_ANY, hss);
2387 if (IS_ERR(op_data)) {
2389 RETURN(PTR_ERR(op_data));
2392 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2395 ll_finish_md_op_data(op_data);
2400 case LL_IOC_HSM_ACTION: {
2401 struct md_op_data *op_data;
2402 struct hsm_current_action *hca;
2409 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2410 LUSTRE_OPC_ANY, hca);
2411 if (IS_ERR(op_data)) {
2413 RETURN(PTR_ERR(op_data));
2416 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2419 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2422 ll_finish_md_op_data(op_data);
2426 case LL_IOC_SET_LEASE: {
2427 struct ll_inode_info *lli = ll_i2info(inode);
2428 struct obd_client_handle *och = NULL;
2434 if (!(file->f_mode & FMODE_WRITE))
2439 if (!(file->f_mode & FMODE_READ))
/* lease release path: detach och from the fd under lli_och_mutex */
2444 mutex_lock(&lli->lli_och_mutex);
2445 if (fd->fd_lease_och != NULL) {
2446 och = fd->fd_lease_och;
2447 fd->fd_lease_och = NULL;
2449 mutex_unlock(&lli->lli_och_mutex);
2452 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2453 rc = ll_lease_close(och, inode, &lease_broken);
2454 if (rc == 0 && lease_broken)
2460 /* return the type of lease or error */
2461 RETURN(rc < 0 ? rc : (int)mode);
2466 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2468 /* apply for lease */
2469 och = ll_lease_open(inode, file, mode);
2471 RETURN(PTR_ERR(och));
2474 mutex_lock(&lli->lli_och_mutex);
2475 if (fd->fd_lease_och == NULL) {
2476 fd->fd_lease_och = och;
2479 mutex_unlock(&lli->lli_och_mutex);
2481 /* impossible now that only excl is supported for now */
2482 ll_lease_close(och, inode, &lease_broken);
2487 case LL_IOC_GET_LEASE: {
2488 struct ll_inode_info *lli = ll_i2info(inode);
2489 struct ldlm_lock *lock = NULL;
2492 mutex_lock(&lli->lli_och_mutex);
2493 if (fd->fd_lease_och != NULL) {
2494 struct obd_client_handle *och = fd->fd_lease_och;
2496 lock = ldlm_handle2lock(&och->och_lease_handle);
2498 lock_res_and_lock(lock);
2499 if (!ldlm_is_cancel(lock))
2500 rc = och->och_flags &
2501 (FMODE_READ | FMODE_WRITE);
2502 unlock_res_and_lock(lock);
2503 ldlm_lock_put(lock);
2506 mutex_unlock(&lli->lli_och_mutex);
/* not handled above: try registered handlers, then the data export */
2514 ll_iocontrol_call(inode, file, cmd, arg, &err))
2517 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * Local fallback for kernels lacking generic_file_llseek_size():
 * llseek_execute() clamps/commits the new position to file->f_pos,
 * and generic_file_llseek_size() implements SEEK_SET/CUR/END plus the
 * SEEK_DATA/SEEK_HOLE semantics against a caller-supplied eof.
 */
2523 #ifndef HAVE_FILE_LLSEEK_SIZE
2524 static inline loff_t
2525 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2527 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2529 if (offset > maxsize)
2532 if (offset != file->f_pos) {
/* f_version reset mirrors the upstream llseek helpers */
2533 file->f_pos = offset;
2534 file->f_version = 0;
2540 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2541 loff_t maxsize, loff_t eof)
2543 struct inode *inode = file->f_dentry->d_inode;
2551  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2552  * position-querying operation. Avoid rewriting the "same"
2553  * f_pos value back to the file because a concurrent read(),
2554  * write() or lseek() might have altered it
2559  * f_lock protects against read/modify/write race with other
2560  * SEEK_CURs. Note that parallel writes and reads behave
2563 mutex_lock(&inode->i_mutex);
2564 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2565 mutex_unlock(&inode->i_mutex);
2569  * In the generic case the entire file is data, so as long as
2570  * offset isn't at the end of the file then the offset is data.
2577  * There is a virtual hole at the end of the file, so as long as
2578  * offset isn't i_size or larger, return i_size.
2586 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse the
 * up-to-date file size from the OSTs, then delegate to the (possibly
 * local) generic_file_llseek_size with Lustre's max byte limit.
 */
2590 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2592 struct inode *inode = file->f_dentry->d_inode;
2593 loff_t retval, eof = 0;
2596 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2597 (origin == SEEK_CUR) ? file->f_pos : 0);
2598 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2599 inode->i_ino, inode->i_generation, inode, retval, retval,
2601 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2603 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2604 retval = ll_glimpse_size(inode);
2607 eof = i_size_read(inode);
2610 retval = ll_generic_file_llseek_size(file, offset, origin,
2611 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on close(2)): surface any async writeback
 * errors recorded on the inode/clob as -EIO, but only once per fd —
 * if this fd already reported a write failure, stay silent.
 */
2615 int ll_flush(struct file *file, fl_owner_t id)
2617 struct inode *inode = file->f_dentry->d_inode;
2618 struct ll_inode_info *lli = ll_i2info(inode);
2619 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2622 LASSERT(!S_ISDIR(inode->i_mode));
2624 /* catch async errors that were recorded back when async writeback
2625  * failed for pages in this mapping. */
2626 rc = lli->lli_async_rc;
2627 lli->lli_async_rc = 0;
2628 err = lov_read_and_clear_async_rc(lli->lli_clob);
2632 /* The application has been told write failure already.
2633  * Do not report failure again. */
2634 if (fd->fd_write_failed)
2636 return rc ? -EIO : 0;
/*
 * Run a CIT_FSYNC cl_io over [start, end] in the requested fsync mode
 * (NONE/LOCAL/DISCARD/ALL).  On success returns the number of pages
 * written (fi_nr_written); invalid modes are rejected up front.
 */
2640  * Called to make sure a portion of file has been written out.
2641  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2643  * Return how many pages have been written.
2645 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2646 enum cl_fsync_mode mode, int ignore_layout)
2648 struct cl_env_nest nest;
2651 struct obd_capa *capa = NULL;
2652 struct cl_fsync_io *fio;
2656 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2657 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2660 env = cl_env_nested_get(&nest);
2662 RETURN(PTR_ERR(env));
2664 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2666 io = ccc_env_thread_io(env);
2667 io->ci_obj = cl_i2info(inode)->lli_clob;
2668 io->ci_ignore_layout = ignore_layout;
2670 /* initialize parameters for sync */
2671 fio = &io->u.ci_fsync;
2672 fio->fi_capa = capa;
2673 fio->fi_start = start;
2675 fio->fi_fid = ll_inode2fid(inode);
2676 fio->fi_mode = mode;
2677 fio->fi_nr_written = 0;
2679 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2680 result = cl_io_loop(env, io);
2682 result = io->ci_result;
2684 result = fio->fi_nr_written;
2685 cl_io_fini(env, io);
2686 cl_env_nested_put(&nest, env);
/*
 * fsync() handler with three configure-time prototypes (4-arg range,
 * 2-arg, and legacy dentry form).  Flushes/waits on the page cache,
 * surfaces recorded async-writeback errors, syncs metadata via md_sync(),
 * and for regular files pushes data to the OSTs via cl_sync_file_range(),
 * updating fd_write_failed accordingly.
 */
2694  * When dentry is provided (the 'else' case), *file->f_dentry may be
2695  * null and dentry must be used directly rather than pulled from
2696  * *file->f_dentry as is done otherwise.
2699 #ifdef HAVE_FILE_FSYNC_4ARGS
2700 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2702 struct dentry *dentry = file->f_dentry;
2703 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2704 int ll_fsync(struct file *file, int datasync)
2706 struct dentry *dentry = file->f_dentry;
2708 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2711 struct inode *inode = dentry->d_inode;
2712 struct ll_inode_info *lli = ll_i2info(inode);
2713 struct ptlrpc_request *req;
2714 struct obd_capa *oc;
2718 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2719 inode->i_generation, inode);
2720 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2722 #ifdef HAVE_FILE_FSYNC_4ARGS
2723 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2724 mutex_lock(&inode->i_mutex);
2726 /* fsync's caller has already called _fdata{sync,write}, we want
2727  * that IO to finish before calling the osc and mdc sync methods */
2728 rc = filemap_fdatawait(inode->i_mapping);
2731 /* catch async errors that were recorded back when async writeback
2732  * failed for pages in this mapping. */
2733 if (!S_ISDIR(inode->i_mode)) {
2734 err = lli->lli_async_rc;
2735 lli->lli_async_rc = 0;
2738 err = lov_read_and_clear_async_rc(lli->lli_clob);
2743 oc = ll_mdscapa_get(inode);
2744 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2750 ptlrpc_req_finished(req);
2752 if (datasync && S_ISREG(inode->i_mode)) {
2753 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2755 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2757 if (rc == 0 && err < 0)
/* remember outcome so ll_flush() doesn't double-report */
2760 fd->fd_write_failed = true;
2762 fd->fd_write_failed = false;
2765 #ifdef HAVE_FILE_FSYNC_4ARGS
2766 mutex_unlock(&inode->i_mutex);
/*
 * VFS ->lock()/->flock() handler: translate a kernel file_lock (POSIX
 * fcntl lock or BSD flock) into an LDLM_FLOCK enqueue on the MDT, then
 * mirror the result into the local kernel lock tables so the VFS sees
 * a consistent state.
 */
2771 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2773 struct inode *inode = file->f_dentry->d_inode;
2774 struct ll_sb_info *sbi = ll_i2sbi(inode);
2775 struct ldlm_enqueue_info einfo = {
2776 .ei_type = LDLM_FLOCK,
2777 .ei_cb_cp = ldlm_flock_completion_ast,
2778 .ei_cbdata = file_lock,
2780 struct md_op_data *op_data;
2781 struct lustre_handle lockh = {0};
2782 ldlm_policy_data_t flock = {{0}};
2788 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2789 inode->i_ino, file_lock);
2791 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* fill the ldlm flock policy from the kernel lock description */
2793 if (file_lock->fl_flags & FL_FLOCK) {
2794 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2795 /* flocks are whole-file locks */
2796 flock.l_flock.end = OFFSET_MAX;
2797 /* For flocks owner is determined by the local file descriptor */
2798 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2799 } else if (file_lock->fl_flags & FL_POSIX) {
2800 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2801 flock.l_flock.start = file_lock->fl_start;
2802 flock.l_flock.end = file_lock->fl_end;
2806 flock.l_flock.pid = file_lock->fl_pid;
2808 /* Somewhat ugly workaround for svc lockd.
2809 * lockd installs custom fl_lmops->lm_compare_owner that checks
2810 * for the fl_owner to be the same (which it always is on local node
2811 * I guess between lockd processes) and then compares pid.
2812 * As such we assign pid to the owner field to make it all work,
2813 * conflict with normal locks is unlikely since pid space and
2814 * pointer space for current->files are not intersecting */
2815 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2816 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the fcntl lock type to an LDLM mode: read -> PR, write -> PW,
 * unlock -> NL (see comment below) */
2818 switch (file_lock->fl_type) {
2820 einfo.ei_mode = LCK_PR;
2823 /* An unlock request may or may not have any relation to
2824 * existing locks so we may not be able to pass a lock handle
2825 * via a normal ldlm_lock_cancel() request. The request may even
2826 * unlock a byte range in the middle of an existing lock. In
2827 * order to process an unlock request we need all of the same
2828 * information that is given with a normal read or write record
2829 * lock request. To avoid creating another ldlm unlock (cancel)
2830 * message we'll treat a LCK_NL flock request as an unlock. */
2831 einfo.ei_mode = LCK_NL;
2834 einfo.ei_mode = LCK_PW;
2837 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2838 file_lock->fl_type);
/* map the fcntl command to enqueue flags: non-blocking set requests
 * use BLOCK_NOWAIT; F_GETLK-style queries use TEST_LOCK */
2853 flags = LDLM_FL_BLOCK_NOWAIT;
2859 flags = LDLM_FL_TEST_LOCK;
2860 /* Save the old mode so that if the mode in the lock changes we
2861 * can decrement the appropriate reader or writer refcount. */
2862 file_lock->fl_type = einfo.ei_mode;
2865 CERROR("unknown fcntl lock command: %d\n", cmd);
2869 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2870 LUSTRE_OPC_ANY, NULL);
2871 if (IS_ERR(op_data))
2872 RETURN(PTR_ERR(op_data));
2874 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2875 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2876 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock lock on the MDT */
2878 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2879 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* on success (or unlock) mirror the state into the kernel's local
 * flock/posix lock tables; TEST_LOCK queries skip the posix update */
2881 if ((file_lock->fl_flags & FL_FLOCK) &&
2882 (rc == 0 || file_lock->fl_type == F_UNLCK))
2883 rc2 = flock_lock_file_wait(file, file_lock);
2884 if ((file_lock->fl_flags & FL_POSIX) &&
2885 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2886 !(flags & LDLM_FL_TEST_LOCK))
2887 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed after a successful remote enqueue:
 * undo by enqueueing an unlock (LCK_NL) for the same range */
2889 if (rc2 && file_lock->fl_type != F_UNLCK) {
2890 einfo.ei_mode = LCK_NL;
2891 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2892 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2896 ll_finish_md_op_data(op_data);
/* ->lock()/->flock() stub installed for -o noflock mounts (see
 * ll_file_operations_noflock below); rejects all lock requests. */
2901 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2909 * test if some locks matching bits and l_req_mode are acquired
2910 * - bits can be in different locks
2911 * - if found clear the common lock bits in *bits
2912 * - the bits not found, are kept in *bits
2914 * \param bits [IN] searched lock bits [IN]
2915 * \param l_req_mode [IN] searched lock mode
2916 * \retval boolean, true iff all bits are found
/*
 * Test whether MD (ibits) locks covering *bits are already cached for
 * @inode in mode @l_req_mode (LCK_MINMODE matches any of CR/CW/PR/PW).
 * Bits found in cached locks are cleared from *bits; bits not found
 * remain set.  Returns true iff all requested bits were found.
 */
2918 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2920 struct lustre_handle lockh;
2921 ldlm_policy_data_t policy;
2922 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2923 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2932 fid = &ll_i2info(inode)->lli_fid;
2933 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2934 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
2936 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit individually; a matched lock may
 * cover more bits than the one probed, so clear all of its bits */
2937 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2938 policy.l_inodebits.bits = *bits & (1 << i);
2939 if (policy.l_inodebits.bits == 0)
2942 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2943 &policy, mode, &lockh)) {
2944 struct ldlm_lock *lock;
2946 lock = ldlm_handle2lock(&lockh);
2949 ~(lock->l_policy_data.l_inodebits.bits);
2950 LDLM_LOCK_PUT(lock);
2952 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MD ibits lock on
 * @inode covering @bits in any of CR/CW/PR/PW modes.  On success the
 * handle is stored in @lockh and the matched mode is returned; 0 means
 * no suitable lock is cached.  @flags are OR-ed into the match flags.
 */
2959 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2960 struct lustre_handle *lockh, __u64 flags)
2962 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2967 fid = &ll_i2info(inode)->lli_fid;
2968 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2970 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2971 fid, LDLM_IBITS, &policy,
2972 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Common epilogue for revalidation: translate the getattr/intent result
 * code.  -ENOENT on a special file is treated as "already unlinked" and
 * turned into success; any other error is logged and propagated.
 */
2976 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2978 /* Already unlinked. Just update nlink and return success */
2979 if (rc == -ENOENT) {
2981 /* This path cannot be hit for regular files unless in
2982 * case of obscure races, so no need to validate
2984 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2986 } else if (rc != 0) {
2987 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2988 ll_get_fsname(inode->i_sb, NULL, 0),
2989 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes against the MDT.  Two paths:
 * if the server supports OBD_CONNECT_ATTRFID, an intent getattr-by-fid
 * is used (no name needed); otherwise, when no covering MD lock is
 * cached, a plain md_getattr RPC refreshes the attributes (plus EA size
 * for regular files).  @ibits selects which inodebits must be valid.
 */
2995 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2998 struct inode *inode = dentry->d_inode;
2999 struct ptlrpc_request *req = NULL;
3000 struct obd_export *exp;
3004 LASSERT(inode != NULL);
3006 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3007 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3009 exp = ll_i2mdexp(inode);
3011 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3012 * But under CMD case, it caused some lock issues, should be fixed
3013 * with new CMD ibits lock. See bug 12718 */
3014 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3015 struct lookup_intent oit = { .it_op = IT_GETATTR };
3016 struct md_op_data *op_data;
/* a pure LOOKUP revalidation can use the cheaper IT_LOOKUP intent */
3018 if (ibits == MDS_INODELOCK_LOOKUP)
3019 oit.it_op = IT_LOOKUP;
3021 /* Call getattr by fid, so do not provide name at all. */
3022 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
3023 dentry->d_inode, NULL, 0, 0,
3024 LUSTRE_OPC_ANY, NULL);
3025 if (IS_ERR(op_data))
3026 RETURN(PTR_ERR(op_data));
3028 oit.it_create_mode |= M_CHECK_STALE;
3029 rc = md_intent_lock(exp, op_data, NULL, 0,
3030 /* we are not interested in name
3033 ll_md_blocking_ast, 0);
3034 ll_finish_md_op_data(op_data);
3035 oit.it_create_mode &= ~M_CHECK_STALE;
3037 rc = ll_inode_revalidate_fini(inode, rc);
3041 rc = ll_revalidate_it_finish(req, &oit, dentry);
3043 ll_intent_release(&oit);
3047 /* Unlinked? Unhash dentry, so it is not picked up later by
3048 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3049 here to preserve get_cwd functionality on 2.6.
3051 if (!dentry->d_inode->i_nlink)
3052 d_lustre_invalidate(dentry, 0);
3054 ll_lookup_finish_locks(&oit, dentry);
3055 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3056 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3057 obd_valid valid = OBD_MD_FLGETATTR;
3058 struct md_op_data *op_data;
/* regular files also need the striping EA to be fetched */
3061 if (S_ISREG(inode->i_mode)) {
3062 rc = ll_get_max_mdsize(sbi, &ealen);
3065 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3068 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3069 0, ealen, LUSTRE_OPC_ANY,
3071 if (IS_ERR(op_data))
3072 RETURN(PTR_ERR(op_data));
3074 op_data->op_valid = valid;
3075 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3076 * capa for this inode. Because we only keep capas of dirs
3078 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3079 ll_finish_md_op_data(op_data);
3081 rc = ll_inode_revalidate_fini(inode, rc);
3085 rc = ll_prep_inode(&inode, req, NULL, NULL);
3088 ptlrpc_req_finished(req);
/*
 * Full revalidation: refresh MD attributes via __ll_inode_revalidate_it,
 * then, for regular files, glimpse the OSTs to refresh the file size
 * (unless an HSM restore is in progress, in which case the MDT-provided
 * size is authoritative and glimpse would block until restore ends).
 * Non-regular files just copy the cached lvb timestamps into the inode.
 */
3092 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3095 struct inode *inode = dentry->d_inode;
3099 rc = __ll_inode_revalidate_it(dentry, it, ibits);
3103 /* if object isn't regular file, don't validate size */
3104 if (!S_ISREG(inode->i_mode)) {
3105 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3106 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3107 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3109 /* In case of restore, the MDT has the right size and has
3110 * already send it back without granting the layout lock,
3111 * inode is up-to-date so glimpse is useless.
3112 * Also to glimpse we need the layout, in case of a running
3113 * restore the MDT holds the layout lock so the glimpse will
3114 * block up to the end of restore (getattr will block)
3116 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3117 rc = ll_glimpse_size(inode);
/*
 * Fill @stat from @de's inode, after revalidating UPDATE|LOOKUP ibits
 * (and size, via ll_inode_revalidate_it).  With a 32-bit API client,
 * the inode number is synthesized from the FID so it fits in 32 bits.
 */
3122 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3123 struct lookup_intent *it, struct kstat *stat)
3125 struct inode *inode = de->d_inode;
3126 struct ll_sb_info *sbi = ll_i2sbi(inode);
3127 struct ll_inode_info *lli = ll_i2info(inode);
3130 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3131 MDS_INODELOCK_LOOKUP);
3132 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3137 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace: derive a 32-bit ino from the FID */
3138 if (ll_need_32bit_api(sbi))
3139 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3141 stat->ino = inode->i_ino;
3142 stat->mode = inode->i_mode;
3143 stat->nlink = inode->i_nlink;
3144 stat->uid = inode->i_uid;
3145 stat->gid = inode->i_gid;
3146 stat->rdev = inode->i_rdev;
3147 stat->atime = inode->i_atime;
3148 stat->mtime = inode->i_mtime;
3149 stat->ctime = inode->i_ctime;
3150 stat->blksize = 1 << inode->i_blkbits;
3152 stat->size = i_size_read(inode);
3153 stat->blocks = inode->i_blocks;
/* VFS ->getattr() entry point: wrap ll_getattr_it with an IT_GETATTR
 * intent. */
3157 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3159 struct lookup_intent it = { .it_op = IT_GETATTR };
3161 return ll_getattr_it(mnt, de, &it, stat);
3164 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * VFS ->fiemap() handler: marshal the kernel's fiemap_extent_info into
 * a ll_user_fiemap buffer (header + fi_extents_max extents), run the
 * mapping via ll_do_fiemap(), and copy flags/extents back out.
 */
3165 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3166 __u64 start, __u64 len)
3170 struct ll_user_fiemap *fiemap;
3171 unsigned int extent_count = fieinfo->fi_extents_max;
3173 num_bytes = sizeof(*fiemap) + (extent_count *
3174 sizeof(struct ll_fiemap_extent));
3175 OBD_ALLOC_LARGE(fiemap, num_bytes);
3180 fiemap->fm_flags = fieinfo->fi_flags;
3181 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3182 fiemap->fm_start = start;
3183 fiemap->fm_length = len;
/* only the first extent is seeded from user state here; the rest of
 * the buffer is filled by ll_do_fiemap() below */
3184 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3185 sizeof(struct ll_fiemap_extent));
3187 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3189 fieinfo->fi_flags = fiemap->fm_flags;
3190 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3191 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3192 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3194 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The lli_lock spinlock protects lli_posix_acl; the caller (VFS ACL
 * machinery) drops the reference taken by posix_acl_dup().
 */
3199 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3201 struct ll_inode_info *lli = ll_i2info(inode);
3202 struct posix_acl *acl = NULL;
3205 spin_lock(&lli->lli_lock);
3206 /* VFS' acl_permission_check->check_acl will release the refcount */
3207 acl = posix_acl_dup(lli->lli_posix_acl);
3208 spin_unlock(&lli->lli_lock);
3213 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL permission callback for kernels whose generic_permission() takes
 * a check_acl hook (pre-2ARGS API).  Checks @mask against the cached
 * POSIX access ACL; compiled out to a stub when CONFIG_FS_POSIX_ACL
 * is disabled.  Under RCU walk (IPERM_FLAG_RCU) it bails out since
 * taking the ACL reference may block.
 */
3215 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3216 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3218 ll_check_acl(struct inode *inode, int mask)
3221 # ifdef CONFIG_FS_POSIX_ACL
3222 struct posix_acl *acl;
3226 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3227 if (flags & IPERM_FLAG_RCU)
3230 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3235 rc = posix_acl_permission(inode, acl, mask);
3236 posix_acl_release(acl);
3239 # else /* !CONFIG_FS_POSIX_ACL */
3241 # endif /* CONFIG_FS_POSIX_ACL */
3243 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * VFS ->permission() entry point (prototype varies by kernel version).
 * Revalidates the root inode before checking it (the root is never
 * revalidated through lookup), delegates to the remote-permission path
 * for remote clients, and otherwise falls through to the kernel's
 * generic permission check with our ACL callback.
 */
3245 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3246 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3248 # ifdef HAVE_INODE_PERMISION_2ARGS
3249 int ll_inode_permission(struct inode *inode, int mask)
3251 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* cannot block under RCU-walk; ask the VFS to retry in ref-walk mode */
3258 #ifdef MAY_NOT_BLOCK
3259 if (mask & MAY_NOT_BLOCK)
3261 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3262 if (flags & IPERM_FLAG_RCU)
3266 /* as root inode are NOT getting validated in lookup operation,
3267 * need to do it before permission check. */
3269 if (inode == inode->i_sb->s_root->d_inode) {
3270 struct lookup_intent it = { .it_op = IT_LOOKUP };
3272 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3273 MDS_INODELOCK_LOOKUP);
3278 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3279 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3281 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3282 return lustre_check_remote_perm(inode, mask);
3284 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3285 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored read/write file_operations members to the names
 * this kernel uses: readv/writev on old kernels, aio_read/aio_write
 * otherwise.  Used in the struct file_operations definitions below. */
3290 #ifdef HAVE_FILE_READV
3291 #define READ_METHOD readv
3292 #define READ_FUNCTION ll_file_readv
3293 #define WRITE_METHOD writev
3294 #define WRITE_FUNCTION ll_file_writev
3296 #define READ_METHOD aio_read
3297 #define READ_FUNCTION ll_file_aio_read
3298 #define WRITE_METHOD aio_write
3299 #define WRITE_FUNCTION ll_file_aio_write
3302 /* -o localflock - only provides locally consistent flock locks */
/* default file_operations: no .flock/.lock methods, so the kernel's
 * local (per-node) lock handling applies */
3303 struct file_operations ll_file_operations = {
3304 .read = ll_file_read,
3305 .READ_METHOD = READ_FUNCTION,
3306 .write = ll_file_write,
3307 .WRITE_METHOD = WRITE_FUNCTION,
3308 .unlocked_ioctl = ll_file_ioctl,
3309 .open = ll_file_open,
3310 .release = ll_file_release,
3311 .mmap = ll_file_mmap,
3312 .llseek = ll_file_seek,
3313 .splice_read = ll_file_splice_read,
/* -o flock: cluster-coherent locking — .flock and .lock route through
 * ll_file_flock, which enqueues LDLM_FLOCK locks on the MDT */
3318 struct file_operations ll_file_operations_flock = {
3319 .read = ll_file_read,
3320 .READ_METHOD = READ_FUNCTION,
3321 .write = ll_file_write,
3322 .WRITE_METHOD = WRITE_FUNCTION,
3323 .unlocked_ioctl = ll_file_ioctl,
3324 .open = ll_file_open,
3325 .release = ll_file_release,
3326 .mmap = ll_file_mmap,
3327 .llseek = ll_file_seek,
3328 .splice_read = ll_file_splice_read,
3331 .flock = ll_file_flock,
3332 .lock = ll_file_flock
3335 /* These are for -o noflock - to return ENOSYS on flock calls */
3336 struct file_operations ll_file_operations_noflock = {
3337 .read = ll_file_read,
3338 .READ_METHOD = READ_FUNCTION,
3339 .write = ll_file_write,
3340 .WRITE_METHOD = WRITE_FUNCTION,
3341 .unlocked_ioctl = ll_file_ioctl,
3342 .open = ll_file_open,
3343 .release = ll_file_release,
3344 .mmap = ll_file_mmap,
3345 .llseek = ll_file_seek,
3346 .splice_read = ll_file_splice_read,
3349 .flock = ll_file_noflock,
3350 .lock = ll_file_noflock
/* inode_operations for regular files; fiemap and get_acl members are
 * compiled in only when the running kernel provides those hooks */
3353 struct inode_operations ll_file_inode_operations = {
3354 .setattr = ll_setattr,
3355 .getattr = ll_getattr,
3356 .permission = ll_inode_permission,
3357 .setxattr = ll_setxattr,
3358 .getxattr = ll_getxattr,
3359 .listxattr = ll_listxattr,
3360 .removexattr = ll_removexattr,
3361 #ifdef HAVE_LINUX_FIEMAP_H
3362 .fiemap = ll_fiemap,
3364 #ifdef HAVE_IOP_GET_ACL
3365 .get_acl = ll_get_acl,
3369 /* dynamic ioctl number support routines */
/* registry of dynamically-registered ioctl handlers: a list of
 * llioc_data entries protected by a read-write semaphore */
3370 static struct llioc_ctl_data {
3371 struct rw_semaphore ioc_sem;
3372 cfs_list_t ioc_head;
3374 __RWSEM_INITIALIZER(llioc.ioc_sem),
3375 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* one registration: callback plus the flexible array of ioctl cmd
 * numbers it handles (iocd_count entries) */
3380 cfs_list_t iocd_list;
3381 unsigned int iocd_size;
3382 llioc_callback_t iocd_cb;
3383 unsigned int iocd_count;
3384 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for the @count command numbers
 * in @cmd.  Returns an opaque cookie (the allocated llioc_data) used
 * later by ll_iocontrol_unregister(), or NULL on bad arguments / OOM.
 */
3387 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3390 struct llioc_data *in_data = NULL;
3393 if (cb == NULL || cmd == NULL ||
3394 count > LLIOC_MAX_CMD || count < 0)
3397 size = sizeof(*in_data) + count * sizeof(unsigned int);
3398 OBD_ALLOC(in_data, size);
3399 if (in_data == NULL)
3402 memset(in_data, 0, sizeof(*in_data));
3403 in_data->iocd_size = size;
3404 in_data->iocd_cb = cb;
3405 in_data->iocd_count = count;
3406 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish on the global list under the write lock */
3408 down_write(&llioc.ioc_sem);
3409 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3410 up_write(&llioc.ioc_sem);
/*
 * Remove the registration identified by @magic (the cookie returned by
 * ll_iocontrol_register) from the list and free it.  Warns if the
 * cookie is not found.
 */
3415 void ll_iocontrol_unregister(void *magic)
3417 struct llioc_data *tmp;
3422 down_write(&llioc.ioc_sem);
3423 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3425 unsigned int size = tmp->iocd_size;
3427 cfs_list_del(&tmp->iocd_list);
3428 up_write(&llioc.ioc_sem);
3430 OBD_FREE(tmp, size);
3434 up_write(&llioc.ioc_sem);
3436 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3439 EXPORT_SYMBOL(ll_iocontrol_register);
3440 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch ioctl @cmd to the registered dynamic handlers.  Walks the
 * registry under the read lock; the first callback that returns
 * LLIOC_STOP terminates the scan and its result code is stored in
 * *rcp.  Returns the last callback's iteration verdict.
 */
3442 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3443 unsigned int cmd, unsigned long arg, int *rcp)
3445 enum llioc_iter ret = LLIOC_CONT;
3446 struct llioc_data *data;
3447 int rc = -EINVAL, i;
3449 down_read(&llioc.ioc_sem);
3450 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3451 for (i = 0; i < data->iocd_count; i++) {
3452 if (cmd != data->iocd_cmd[i])
3455 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3459 if (ret == LLIOC_STOP)
3462 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration @conf down to the cl_object stack for
 * @inode.  For OBJECT_CONF_SET, the associated layout lock is only
 * allowed to be matched after the layout has been applied, so stale
 * layouts are never observed.
 */
3469 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3471 struct ll_inode_info *lli = ll_i2info(inode);
3472 struct cl_env_nest nest;
3477 if (lli->lli_clob == NULL)
3480 env = cl_env_nested_get(&nest);
3482 RETURN(PTR_ERR(env));
3484 result = cl_conf_set(env, lli->lli_clob, conf);
3485 cl_env_nested_put(&nest, env);
3487 if (conf->coc_opc == OBJECT_CONF_SET) {
3488 struct ldlm_lock *lock = conf->coc_lock;
3490 LASSERT(lock != NULL);
3491 LASSERT(ldlm_has_layout(lock));
3493 /* it can only be allowed to match after layout is
3494 * applied to inode otherwise false layout would be
3495 * seen. Applying layout should happen before dropping
3496 * the intent lock. */
3497 ldlm_lock_allow_match(lock);
3503 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Ensure @lock carries the layout LVB for @inode.  If the lock already
 * has LVB data and is LVB_READY this is a no-op; otherwise fetch the
 * LOV EA from the MDT via md_getxattr and install it as the lock's
 * l_lvb_data (freeing any stale buffer under the resource lock).
 */
3504 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3507 struct ll_sb_info *sbi = ll_i2sbi(inode);
3508 struct obd_capa *oc;
3509 struct ptlrpc_request *req;
3510 struct mdt_body *body;
3517 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3518 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3519 lock->l_lvb_data, lock->l_lvb_len);
3521 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3524 /* if layout lock was granted right away, the layout is returned
3525 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3526 * blocked and then granted via completion ast, we have to fetch
3527 * layout here. Please note that we can't use the LVB buffer in
3528 * completion AST because it doesn't have a large enough buffer */
3529 oc = ll_mdscapa_get(inode);
3530 rc = ll_get_max_mdsize(sbi, &lmmsize);
3532 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3533 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
/* validate the reply: body present and EA fits the advertised size */
3539 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3540 if (body == NULL || body->eadatasize > lmmsize)
3541 GOTO(out, rc = -EPROTO);
3543 lmmsize = body->eadatasize;
3544 if (lmmsize == 0) /* empty layout */
3547 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3549 GOTO(out, rc = -EFAULT);
3551 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3552 if (lvbdata == NULL)
3553 GOTO(out, rc = -ENOMEM);
/* swap in the freshly-fetched layout under the resource lock */
3555 memcpy(lvbdata, lmm, lmmsize);
3556 lock_res_and_lock(lock);
3557 if (lock->l_lvb_data != NULL)
3558 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3560 lock->l_lvb_data = lvbdata;
3561 lock->l_lvb_len = lmmsize;
3562 unlock_res_and_lock(lock);
3567 ptlrpc_req_finished(req);
3572 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (@lockh/@mode), fetch the layout LVB if
 * needed, unpack it, configure the cl_object layout of @inode, and
 * return the resulting layout generation in *gen.  @reconf selects
 * whether an already-LVB_READY lock should still trigger
 * reconfiguration.  The lock reference is always dropped before
 * returning; if the cl_object is still busy (-EBUSY) the function
 * waits for in-flight IO via an OBJECT_CONF_WAIT configuration.
 */
3575 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3576 struct inode *inode, __u32 *gen, bool reconf)
3578 struct ll_inode_info *lli = ll_i2info(inode);
3579 struct ll_sb_info *sbi = ll_i2sbi(inode);
3580 struct ldlm_lock *lock;
3581 struct lustre_md md = { NULL };
3582 struct cl_object_conf conf;
3585 bool wait_layout = false;
3588 LASSERT(lustre_handle_is_used(lockh));
3590 lock = ldlm_handle2lock(lockh);
3591 LASSERT(lock != NULL);
3592 LASSERT(ldlm_has_layout(lock));
3594 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3595 inode, PFID(&lli->lli_fid), reconf);
3597 /* in case this is a caching lock and reinstate with new inode */
3598 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3600 lock_res_and_lock(lock);
3601 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3602 unlock_res_and_lock(lock);
3603 /* checking lvb_ready is racy but this is okay. The worst case is
3604 * that multi processes may configure the file on the same time. */
3606 if (lvb_ready || !reconf) {
3609 /* layout_gen must be valid if layout lock is not
3610 * cancelled and stripe has already set */
3611 *gen = lli->lli_layout_gen;
3617 rc = ll_layout_fetch(inode, lock);
3621 /* for layout lock, lmm is returned in lock's lvb.
3622 * lvb_data is immutable if the lock is held so it's safe to access it
3623 * without res lock. See the description in ldlm_lock_decref_internal()
3624 * for the condition to free lvb_data of layout lock */
3625 if (lock->l_lvb_data != NULL) {
3626 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3627 lock->l_lvb_data, lock->l_lvb_len);
/* no stripe data -> empty-layout generation; otherwise take the
 * generation carried by the unpacked stripe md */
3629 *gen = LL_LAYOUT_GEN_EMPTY;
3631 *gen = md.lsm->lsm_layout_gen;
3634 CERROR("%s: file "DFID" unpackmd error: %d\n",
3635 ll_get_fsname(inode->i_sb, NULL, 0),
3636 PFID(&lli->lli_fid), rc);
3642 /* set layout to file. Unlikely this will fail as old layout was
3643 * surely eliminated */
3644 memset(&conf, 0, sizeof conf);
3645 conf.coc_opc = OBJECT_CONF_SET;
3646 conf.coc_inode = inode;
3647 conf.coc_lock = lock;
3648 conf.u.coc_md = &md;
3649 rc = ll_layout_conf(inode, &conf);
3652 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3654 /* refresh layout failed, need to wait */
3655 wait_layout = rc == -EBUSY;
3659 LDLM_LOCK_PUT(lock);
3660 ldlm_lock_decref(lockh, mode);
3662 /* wait for IO to complete if it's still being used. */
3664 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3665 ll_get_fsname(inode->i_sb, NULL, 0),
3666 inode, PFID(&lli->lli_fid));
3668 memset(&conf, 0, sizeof conf);
3669 conf.coc_opc = OBJECT_CONF_WAIT;
3670 conf.coc_inode = inode;
3671 rc = ll_layout_conf(inode, &conf);
3675 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3676 PFID(&lli->lli_fid), rc);
3682 * This function checks if there exists a LAYOUT lock on the client side,
3683 * or enqueues it if it doesn't have one in cache.
3685 * This function will not hold layout lock so it may be revoked any time after
3686 * this function returns. Any operations depend on layout should be redone
3689 * This function should be called before lov_io_init() to get an uptodate
3690 * layout version, the caller should save the version number and after IO
3691 * is finished, this function should be called again to verify that layout
3692 * is not changed during IO time.
/*
 * Make sure the client holds (or enqueues) the LAYOUT ibits lock for
 * @inode and return the current layout generation in *gen.  First try
 * to match a cached lock; on miss, serialize on lli_layout_mutex and
 * enqueue an IT_LAYOUT intent on the MDT, retrying the whole sequence
 * if application/configuration races are detected.
 */
3694 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3696 struct ll_inode_info *lli = ll_i2info(inode);
3697 struct ll_sb_info *sbi = ll_i2sbi(inode);
3698 struct md_op_data *op_data;
3699 struct lookup_intent it;
3700 struct lustre_handle lockh;
3702 struct ldlm_enqueue_info einfo = {
3703 .ei_type = LDLM_IBITS,
3705 .ei_cb_bl = ll_md_blocking_ast,
3706 .ei_cb_cp = ldlm_completion_ast,
/* layout locks disabled (server lacks support): report cached gen */
3711 *gen = lli->lli_layout_gen;
3712 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3716 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3717 LASSERT(S_ISREG(inode->i_mode));
3719 /* mostly layout lock is caching on the local side, so try to match
3720 * it before grabbing layout lock mutex. */
3721 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3722 if (mode != 0) { /* hit cached lock */
3723 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3727 /* better hold lli_layout_mutex to try again otherwise
3728 * it will have starvation problem. */
3731 /* take layout lock mutex to enqueue layout lock exclusively. */
3732 mutex_lock(&lli->lli_layout_mutex);
3735 /* try again. Maybe somebody else has done this. */
3736 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3737 if (mode != 0) { /* hit cached lock */
3738 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3742 mutex_unlock(&lli->lli_layout_mutex);
3746 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3747 0, 0, LUSTRE_OPC_ANY, NULL);
3748 if (IS_ERR(op_data)) {
3749 mutex_unlock(&lli->lli_layout_mutex);
3750 RETURN(PTR_ERR(op_data));
3753 /* have to enqueue one */
3754 memset(&it, 0, sizeof(it));
3755 it.it_op = IT_LAYOUT;
3756 lockh.cookie = 0ULL;
3758 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3759 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3760 PFID(&lli->lli_fid));
3762 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent reply request is not needed past this point */
3764 if (it.d.lustre.it_data != NULL)
3765 ptlrpc_req_finished(it.d.lustre.it_data);
3766 it.d.lustre.it_data = NULL;
3768 ll_finish_md_op_data(op_data);
/* transfer the lock reference out of the intent before releasing it */
3770 mode = it.d.lustre.it_lock_mode;
3771 it.d.lustre.it_lock_mode = 0;
3772 ll_intent_drop_lock(&it);
3775 /* set lock data in case this is a new lock */
3776 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3777 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3781 mutex_unlock(&lli->lli_layout_mutex);
3787 * This function send a restore request to the MDT
3789 int ll_layout_restore(struct inode *inode)
3791 struct hsm_user_request *hur;
3795 len = sizeof(struct hsm_user_request) +
3796 sizeof(struct hsm_user_item);
3797 OBD_ALLOC(hur, len);
3801 hur->hur_request.hr_action = HUA_RESTORE;
3802 hur->hur_request.hr_archive_id = 0;
3803 hur->hur_request.hr_flags = 0;
3804 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3805 sizeof(hur->hur_user_item[0].hui_fid));
3806 hur->hur_user_item[0].hui_extent.length = -1;
3807 hur->hur_request.hr_itemcount = 1;
3808 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,