4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab cache
 * (__GFP_IO to avoid filesystem re-entry during reclaim) and clear its
 * write-failure flag.
 * NOTE(review): this extracted view elides lines (allocation NULL check
 * and return are not visible) — confirm against the full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
61 fd->fd_write_failed = false;
/* Release an ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (fid, mode, times, size, blocks,
 * flags, ioepoch) and the open handle @fh into @op_data for an MDS
 * request.  Also takes an MDS capability reference (op_capa1) and, if
 * the inode has pending data modifications (LLIF_DATA_MODIFIED), sets
 * MDS_DATA_MODIFIED in op_bias so the MDS clears the dirty state.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the lustre-extended iattr wrapper, hence the cast */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
/* reference obtained here; presumably released by the caller's
 * ll_finish_md_op_data() — confirm ownership in full source */
87 op_data->op_capa1 = ll_mdscapa_get(inode);
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Prepare @op_data for an MDS close RPC: mark the mode/time attributes
 * valid, include size/blocks when Size-on-MDS is not in effect, close
 * the IO epoch, then pack the inode attributes and the open handle.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
/* size/blocks only matter for writable handles */
106 if (!(och->och_flags & FMODE_WRITE))
/* without SOM support (or for non-regular files) the MDS owns size */
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* note: &och — ll_ioepoch_close() takes a handle pointer-to-pointer */
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a close RPC to the MDS for open handle @och on @inode.
 * Performs epoch close, optional Size-on-MDS update, clears the
 * DATA_MODIFIED flag on success, destroys orphan OST objects recorded
 * in the close reply, and tears down open-replay data.
 * NOTE(review): several control-flow lines (if/else, RETURN, labels)
 * are elided in this extracted view.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock */
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
/* destroy OST objects listed in the close reply (unlinked file) */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM epoch still open: defer och free until DONE_WRITING completes */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Drop the per-mode MDS open handle (write/exec/read) for @inode once
 * its use count reaches zero, sending the close RPC via
 * ll_close_inode_openhandle().  If other users still hold the handle,
 * return without closing.
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* pick the handle slot matching the open mode */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the
 * open-mode use count, and — unless a cached OPEN DLM lock lets us skip
 * the RPC — call ll_md_real_close() to close the MDS handle.  Finally
 * detach and free the ll_file_data and release the OSS capability.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a granted lock, do not take a reference */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN ibits lock -> must really close on the MDS */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
/* remote-client ACL bookkeeping only applies to the root inode */
313 #ifdef CONFIG_FS_POSIX_ACL
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
/* don't account releases of the root dentry in stats */
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* the root dentry has no MDS open handle; just free fd and return */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
/* propagate any deferred async write error into this close */
345 if (!S_ISDIR(inode->i_mode)) {
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize, used by setstripe).  Requests
 * an OPEN lock when no striping is being set, fills the inode from the
 * reply, and attaches lock data on success.
 */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 const char *name = file->f_dentry->d_name.name;
364 const int len = file->f_dentry->d_name.len;
365 struct md_op_data *op_data;
366 struct ptlrpc_request *req;
367 __u32 opc = LUSTRE_OPC_ANY;
374 /* Usually we come here only for NFSD, and we want open lock.
375 But we can also get here with pre 2.6.15 patchless kernels, and in
376 that case that lock is also ok */
377 /* We can also get here if there was cached open handle in revalidate_it
378 * but it disappeared while we were getting from there to ll_file_open.
379 * But this means this file was closed and immediately opened which
380 * makes a good candidate for using OPEN lock */
381 /* If lmmsize & lmm are not 0, we are just setting stripe info
382 * parameters. No need for the open lock */
383 if (lmm == NULL && lmmsize == 0) {
384 itp->it_flags |= MDS_OPEN_LOCK;
385 if (itp->it_flags & FMODE_WRITE)
386 opc = LUSTRE_OPC_CREATE;
389 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
390 file->f_dentry->d_inode, name, len,
393 RETURN(PTR_ERR(op_data));
/* the inode is already known: open by FID rather than by name */
395 itp->it_flags |= MDS_OPEN_BY_FID;
396 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
397 0 /*unused */, &req, ll_md_blocking_ast, 0);
398 ll_finish_md_op_data(op_data);
400 /* reason for keeping own exit path - don't flood log
401 * with messages with -ESTALE errors.
403 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
404 it_open_error(DISP_OPEN_OPEN, itp))
406 ll_release_openhandle(file->f_dentry, itp);
410 if (it_disposition(itp, DISP_LOOKUP_NEG))
411 GOTO(out, rc = -ENOENT);
413 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
414 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
415 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* update the inode from the open reply and bind the DLM lock to it */
419 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
420 if (!rc && itp->d.lustre.it_lock_mode)
421 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
425 ptlrpc_req_finished(itp->d.lustre.it_data);
426 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
427 ll_intent_drop_lock(itp);
433 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
434 * not believe attributes if a few ioepoch holders exist. Attributes for
435 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly obtained @ioepoch on the inode (no lock needed — see
 * the explanation in the comment above this function in the full file). */
437 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
439 if (ioepoch && lli->lli_ioepoch != ioepoch) {
440 lli->lli_ioepoch = ioepoch;
441 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
442 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the intent's open reply: copy the MDS
 * file handle, record fid/flags, note the new IO epoch, and register
 * the open for replay so it survives MDS recovery.
 */
446 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
447 struct lookup_intent *it, struct obd_client_handle *och)
449 struct ptlrpc_request *req = it->d.lustre.it_data;
450 struct mdt_body *body;
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 LASSERT(body != NULL); /* reply already checked out */
457 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
458 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
459 och->och_fid = lli->lli_fid;
460 och->och_flags = it->it_flags;
461 ll_ioepoch_open(lli, body->ioepoch);
463 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: fill @och from the intent
 * reply (when provided), attach @fd as file->private_data, initialize
 * readahead state, and remember the open mode.
 */
466 int ll_local_open(struct file *file, struct lookup_intent *it,
467 struct ll_file_data *fd, struct obd_client_handle *och)
469 struct inode *inode = file->f_dentry->d_inode;
470 struct ll_inode_info *lli = ll_i2info(inode);
/* private_data must not be set yet (cleared by ll_file_open) */
473 LASSERT(!LUSTRE_FPRIVATE(file));
478 struct ptlrpc_request *req = it->d.lustre.it_data;
479 struct mdt_body *body;
482 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
486 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
487 if ((it->it_flags & FMODE_WRITE) &&
488 (body->valid & OBD_MD_FLSIZE))
489 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
490 lli->lli_ioepoch, PFID(&lli->lli_fid));
493 LUSTRE_FPRIVATE(file) = fd;
494 ll_readahead_init(inode, &fd->fd_ras);
495 fd->fd_omode = it->it_flags;
499 /* Open a file, and (for the very first open) create objects on the OSTs at
500 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
501 * creation or open until ll_lov_setstripe() ioctl is called.
503 * If we already have the stripe MD locally then we don't request it in
504 * md_open(), by passing a lmm_size = 0.
506 * It is up to the application to ensure no other processes open this file
507 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
508 * used. We might be able to avoid races of that sort by getting lli_open_sem
509 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
510 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
512 int ll_file_open(struct inode *inode, struct file *file)
514 struct ll_inode_info *lli = ll_i2info(inode);
515 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
516 .it_flags = file->f_flags };
517 struct obd_client_handle **och_p = NULL;
518 __u64 *och_usecount = NULL;
519 struct ll_file_data *fd;
520 int rc = 0, opendir_set = 0;
523 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
524 inode->i_generation, inode, file->f_flags);
/* an intent may have been stashed by the lookup path */
526 it = file->private_data; /* XXX: compat macro */
527 file->private_data = NULL; /* prevent ll_local_open assertion */
529 fd = ll_file_data_get();
531 GOTO(out_openerr, rc = -ENOMEM);
/* first opener of a directory becomes the statahead owner */
534 if (S_ISDIR(inode->i_mode)) {
535 spin_lock(&lli->lli_sa_lock);
536 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
537 lli->lli_opendir_pid == 0) {
538 lli->lli_opendir_key = fd;
539 lli->lli_opendir_pid = cfs_curproc_pid();
542 spin_unlock(&lli->lli_sa_lock);
/* root dentry: no MDS open needed, just attach fd */
545 if (inode->i_sb->s_root == file->f_dentry) {
546 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent -> build our own (oit) from f_flags */
550 if (!it || !it->d.lustre.it_disposition) {
551 /* Convert f_flags into access mode. We cannot use file->f_mode,
552 * because everything but O_ACCMODE mask was stripped from
554 if ((oit.it_flags + 1) & O_ACCMODE)
556 if (file->f_flags & O_TRUNC)
557 oit.it_flags |= FMODE_WRITE;
559 /* kernel only call f_op->open in dentry_open. filp_open calls
560 * dentry_open after call to open_namei that checks permissions.
561 * Only nfsd_open call dentry_open directly without checking
562 * permissions and because of that this code below is safe. */
563 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
564 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
566 /* We do not want O_EXCL here, presumably we opened the file
567 * already? XXX - NFS implications? */
568 oit.it_flags &= ~O_EXCL;
570 /* bug20584, if "it_flags" contains O_CREAT, the file will be
571 * created if necessary, then "IT_CREAT" should be set to keep
572 * consistent with it */
573 if (oit.it_flags & O_CREAT)
574 oit.it_op |= IT_CREAT;
580 /* Let's see if we have file open on MDS already. */
581 if (it->it_flags & FMODE_WRITE) {
582 och_p = &lli->lli_mds_write_och;
583 och_usecount = &lli->lli_open_fd_write_count;
584 } else if (it->it_flags & FMODE_EXEC) {
585 och_p = &lli->lli_mds_exec_och;
586 och_usecount = &lli->lli_open_fd_exec_count;
588 och_p = &lli->lli_mds_read_och;
589 och_usecount = &lli->lli_open_fd_read_count;
592 mutex_lock(&lli->lli_och_mutex);
593 if (*och_p) { /* Open handle is present */
594 if (it_disposition(it, DISP_OPEN_OPEN)) {
595 /* Well, there's extra open request that we do not need,
596 let's close it somehow. This will decref request. */
597 rc = it_open_error(DISP_OPEN_OPEN, it);
599 mutex_unlock(&lli->lli_och_mutex);
600 GOTO(out_openerr, rc);
603 ll_release_openhandle(file->f_dentry, it);
/* reuse the existing MDS handle for this local open */
607 rc = ll_local_open(file, it, fd, NULL);
610 mutex_unlock(&lli->lli_och_mutex);
611 GOTO(out_openerr, rc);
614 LASSERT(*och_usecount == 0);
615 if (!it->d.lustre.it_disposition) {
616 /* We cannot just request lock handle now, new ELC code
617 means that one of other OPEN locks for this file
618 could be cancelled, and since blocking ast handler
619 would attempt to grab och_mutex as well, that would
620 result in a deadlock */
621 mutex_unlock(&lli->lli_och_mutex);
622 it->it_create_mode |= M_CHECK_STALE;
623 rc = ll_intent_file_open(file, NULL, 0, it);
624 it->it_create_mode &= ~M_CHECK_STALE;
626 GOTO(out_openerr, rc);
630 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
632 GOTO(out_och_free, rc = -ENOMEM);
636 /* md_intent_lock() didn't get a request ref if there was an
637 * open error, so don't do cleanup on the request here
639 /* XXX (green): Should not we bail out on any error here, not
640 * just open error? */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 GOTO(out_och_free, rc);
645 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
647 rc = ll_local_open(file, it, fd, *och_p);
649 GOTO(out_och_free, rc);
651 mutex_unlock(&lli->lli_och_mutex);
654 /* Must do this outside lli_och_mutex lock to prevent deadlock where
655 different kind of OPEN lock for this same inode gets cancelled
656 by ldlm_cancel_lru */
657 if (!S_ISREG(inode->i_mode))
658 GOTO(out_och_free, rc);
/* no striping yet: may be deferred (O_LOV_DELAY_CREATE / read-only) */
662 if (!lli->lli_has_smd) {
663 if (file->f_flags & O_LOV_DELAY_CREATE ||
664 !(file->f_mode & FMODE_WRITE)) {
665 CDEBUG(D_INODE, "object creation was delayed\n");
666 GOTO(out_och_free, rc);
669 file->f_flags &= ~O_LOV_DELAY_CREATE;
670 GOTO(out_och_free, rc);
/* error path: free a handle we allocated but failed to use */
674 if (och_p && *och_p) {
675 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676 *och_p = NULL; /* OBD_FREE writes some magic there */
679 mutex_unlock(&lli->lli_och_mutex);
682 if (opendir_set != 0)
683 ll_stop_statahead(inode, lli->lli_opendir_key);
685 ll_file_data_put(fd);
687 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
690 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
691 ptlrpc_req_finished(it->d.lustre.it_data);
692 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
698 /* Fills the obdo with the attributes for the lsm */
699 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
700 struct obd_capa *capa, struct obdo *obdo,
701 __u64 ioepoch, int sync)
703 struct ptlrpc_request_set *set;
704 struct obd_info oinfo = { { { 0 } } };
709 LASSERT(lsm != NULL);
713 oinfo.oi_oa->o_oi = lsm->lsm_oi;
714 oinfo.oi_oa->o_mode = S_IFREG;
715 oinfo.oi_oa->o_ioepoch = ioepoch;
716 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
717 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
718 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
719 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
720 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
721 OBD_MD_FLDATAVERSION;
722 oinfo.oi_capa = capa;
724 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
725 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
728 set = ptlrpc_prep_set();
730 CERROR("can't allocate ptlrpc set\n");
733 rc = obd_getattr_async(exp, &oinfo, set);
735 rc = ptlrpc_set_wait(set);
736 ptlrpc_set_destroy(set);
739 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
740 OBD_MD_FLATIME | OBD_MD_FLMTIME |
741 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
742 OBD_MD_FLDATAVERSION);
747 * Performs the getattr on the inode and updates its fields.
748 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Fetch OST attributes for @inode via ll_lsm_getattr() and refresh the
 * inode fields from the result.  @sync requests a server-side lock.
 */
750 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
751 __u64 ioepoch, int sync)
753 struct obd_capa *capa = ll_mdscapa_get(inode);
754 struct lov_stripe_md *lsm;
758 lsm = ccc_inode_lsm_get(inode);
759 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
760 capa, obdo, ioepoch, sync);
763 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
765 obdo_refresh_inode(inode, obdo, obdo->o_valid);
766 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
767 " blksize %lu\n", POSTID(oi), i_size_read(inode),
768 (unsigned long long)inode->i_blocks,
769 (unsigned long)ll_inode_blksize(inode));
771 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * obtained from the OSTs via the cl_object layer, taking the most
 * recent of each, and update the inode's size/blocks/times under the
 * inode size lock.
 */
775 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct cl_object *obj = lli->lli_clob;
779 struct cl_attr *attr = ccc_env_thread_attr(env);
785 ll_inode_size_lock(inode);
786 /* merge timestamps the most recently obtained from mds with
787 timestamps obtained from osts */
788 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
789 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
790 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
791 inode_init_lvb(inode, &lvb);
793 cl_object_attr_lock(obj);
794 rc = cl_object_attr_get(env, obj, attr);
795 cl_object_attr_unlock(obj);
/* keep the newer of MDS and OST timestamps */
798 if (lvb.lvb_atime < attr->cat_atime)
799 lvb.lvb_atime = attr->cat_atime;
800 if (lvb.lvb_ctime < attr->cat_ctime)
801 lvb.lvb_ctime = attr->cat_ctime;
802 if (lvb.lvb_mtime < attr->cat_mtime)
803 lvb.lvb_mtime = attr->cat_mtime;
805 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
806 PFID(&lli->lli_fid), attr->cat_size);
807 cl_isize_write_nolock(inode, attr->cat_size);
809 inode->i_blocks = attr->cat_blocks;
811 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
812 LTIME_S(inode->i_atime) = lvb.lvb_atime;
813 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
815 ll_inode_size_unlock(inode);
/*
 * Glimpse ioctl helper: query current OST attributes for @lsm and copy
 * size/blocks/times into the user-visible stat structure.
 */
820 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
823 struct obdo obdo = { 0 };
826 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
828 st->st_size = obdo.o_size;
829 st->st_blocks = obdo.o_blocks;
830 st->st_mtime = obdo.o_mtime;
831 st->st_atime = obdo.o_atime;
832 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics and
 * choose the DLM locking policy (never for nolock files, mandatory for
 * append, otherwise "maybe").
 */
837 void ll_io_init(struct cl_io *io, const struct file *file, int write)
839 struct inode *inode = file->f_dentry->d_inode;
841 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
843 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
844 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
845 file->f_flags & O_DIRECT ||
848 io->ci_obj = ll_i2info(inode)->lli_clob;
849 io->ci_lockreq = CILR_MAYBE;
850 if (ll_file_nolock(file)) {
851 io->ci_lockreq = CILR_NEVER;
852 io->ci_no_srvlock = 1;
853 } else if (file->f_flags & O_APPEND) {
854 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (normal iov, sendfile,
 * splice): set up a cl_io, copy per-subtype arguments, take the write
 * mutex / trunc semaphore as appropriate, run the cl_io loop, update
 * *ppos and per-mount stats, and track write failures on @fd.
 */
859 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
860 struct file *file, enum cl_io_type iot,
861 loff_t *ppos, size_t count)
863 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
864 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
870 io = ccc_env_thread_io(env);
871 ll_io_init(io, file, iot == CIT_WRITE);
873 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
874 struct vvp_io *vio = vvp_env_io(env);
875 struct ccc_io *cio = ccc_env_io(env);
876 int write_mutex_locked = 0;
878 cio->cui_fd = LUSTRE_FPRIVATE(file);
879 vio->cui_io_subtype = args->via_io_subtype;
881 switch (vio->cui_io_subtype) {
883 cio->cui_iov = args->u.normal.via_iov;
884 cio->cui_nrsegs = args->u.normal.via_nrsegs;
885 cio->cui_tot_nrsegs = cio->cui_nrsegs;
886 #ifndef HAVE_FILE_WRITEV
887 cio->cui_iocb = args->u.normal.via_iocb;
/* serialize non-group-lock writes with lli_write_mutex; reads only
 * need the truncate semaphore (shared) */
889 if ((iot == CIT_WRITE) &&
890 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
891 if (mutex_lock_interruptible(&lli->
893 GOTO(out, result = -ERESTARTSYS);
894 write_mutex_locked = 1;
895 } else if (iot == CIT_READ) {
896 down_read(&lli->lli_trunc_sem);
900 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
901 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
904 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
905 vio->u.splice.cui_flags = args->u.splice.via_flags;
908 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
911 result = cl_io_loop(env, io);
912 if (write_mutex_locked)
913 mutex_unlock(&lli->lli_write_mutex);
914 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
915 up_read(&lli->lli_trunc_sem);
917 /* cl_io_rw_init() handled IO */
918 result = io->ci_result;
921 if (io->ci_nob > 0) {
923 *ppos = io->u.ci_wr.wr.crw_pos;
928 /* If any bit been read/written (result != 0), we just return
929 * short read/write instead of restart io. */
930 if (result == 0 && io->ci_need_restart) {
931 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
932 iot == CIT_READ ? "read" : "write",
933 file->f_dentry->d_name.name, *ppos, count);
934 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
938 if (iot == CIT_READ) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_READ_BYTES, result);
942 } else if (iot == CIT_WRITE) {
944 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
945 LPROC_LL_WRITE_BYTES, result);
946 fd->fd_write_failed = false;
947 } else if (result != -ERESTARTSYS) {
/* remember the failure so fsync/close can report it */
948 fd->fd_write_failed = true;
957 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, truncating
 * at the first inaccessible segment (see the kernel original for the
 * exact semantics).
 */
959 static int ll_file_get_iov_count(const struct iovec *iov,
960 unsigned long *nr_segs, size_t *count)
965 for (seg = 0; seg < *nr_segs; seg++) {
966 const struct iovec *iv = &iov[seg];
969 * If any segment has a negative length, or the cumulative
970 * length ever wraps negative then return -EINVAL.
973 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
975 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
980 cnt -= iv->iov_len; /* This segment is no good */
987 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-AIO kernels): validate the iovec,
 * set up a cl_env and IO_NORMAL args, and run the generic read path. */
988 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
989 unsigned long nr_segs, loff_t *ppos)
992 struct vvp_io_args *args;
998 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1002 env = cl_env_get(&refcheck);
1004 RETURN(PTR_ERR(env));
1006 args = vvp_env_args(env, IO_NORMAL);
1007 args->u.normal.via_iov = (struct iovec *)iov;
1008 args->u.normal.via_nrsegs = nr_segs;
1010 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1011 cl_env_put(env, &refcheck);
/* Single-buffer read: wrap (buf, count) in a one-segment iovec kept in
 * per-thread env storage and delegate to ll_file_readv(). */
1015 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1019 struct iovec *local_iov;
1024 env = cl_env_get(&refcheck);
1026 RETURN(PTR_ERR(env));
1028 local_iov = &vvp_env_info(env)->vti_local_iov;
1029 local_iov->iov_base = (void __user *)buf;
1030 local_iov->iov_len = count;
1031 result = ll_file_readv(file, local_iov, 1, ppos);
1032 cl_env_put(env, &refcheck);
/* AIO read entry point: validate the iovec, pass the kiocb through the
 * IO_NORMAL args, and run the generic read path at iocb->ki_pos. */
1037 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1038 unsigned long nr_segs, loff_t pos)
1041 struct vvp_io_args *args;
1047 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1051 env = cl_env_get(&refcheck);
1053 RETURN(PTR_ERR(env));
1055 args = vvp_env_args(env, IO_NORMAL);
1056 args->u.normal.via_iov = (struct iovec *)iov;
1057 args->u.normal.via_nrsegs = nr_segs;
1058 args->u.normal.via_iocb = iocb;
1060 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1061 &iocb->ki_pos, count);
1062 cl_env_put(env, &refcheck);
/* Single-buffer read (AIO kernels): build a synchronous kiocb plus a
 * one-segment iovec in env storage and delegate to ll_file_aio_read();
 * copy the advanced position back to *ppos. */
1066 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1070 struct iovec *local_iov;
1071 struct kiocb *kiocb;
1076 env = cl_env_get(&refcheck);
1078 RETURN(PTR_ERR(env));
1080 local_iov = &vvp_env_info(env)->vti_local_iov;
1081 kiocb = &vvp_env_info(env)->vti_kiocb;
1082 local_iov->iov_base = (void __user *)buf;
1083 local_iov->iov_len = count;
1084 init_sync_kiocb(kiocb, file);
1085 kiocb->ki_pos = *ppos;
1086 kiocb->ki_left = count;
1088 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1089 *ppos = kiocb->ki_pos;
1091 cl_env_put(env, &refcheck);
1097 * Write to a file (through the page cache).
1099 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-AIO kernels): mirror of
 * ll_file_readv() with CIT_WRITE. */
1100 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1101 unsigned long nr_segs, loff_t *ppos)
1104 struct vvp_io_args *args;
1110 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1114 env = cl_env_get(&refcheck);
1116 RETURN(PTR_ERR(env));
1118 args = vvp_env_args(env, IO_NORMAL);
1119 args->u.normal.via_iov = (struct iovec *)iov;
1120 args->u.normal.via_nrsegs = nr_segs;
1122 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1123 cl_env_put(env, &refcheck);
/* Single-buffer write: wrap (buf, count) in a one-segment iovec and
 * delegate to ll_file_writev(). */
1127 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1131 struct iovec *local_iov;
1136 env = cl_env_get(&refcheck);
1138 RETURN(PTR_ERR(env));
1140 local_iov = &vvp_env_info(env)->vti_local_iov;
1141 local_iov->iov_base = (void __user *)buf;
1142 local_iov->iov_len = count;
1144 result = ll_file_writev(file, local_iov, 1, ppos);
1145 cl_env_put(env, &refcheck);
1149 #else /* AIO stuff */
/* AIO write entry point: mirror of ll_file_aio_read() with CIT_WRITE. */
1150 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1151 unsigned long nr_segs, loff_t pos)
1154 struct vvp_io_args *args;
1160 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1164 env = cl_env_get(&refcheck);
1166 RETURN(PTR_ERR(env));
1168 args = vvp_env_args(env, IO_NORMAL);
1169 args->u.normal.via_iov = (struct iovec *)iov;
1170 args->u.normal.via_nrsegs = nr_segs;
1171 args->u.normal.via_iocb = iocb;
1173 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1174 &iocb->ki_pos, count);
1175 cl_env_put(env, &refcheck);
/* Single-buffer write (AIO kernels): build a synchronous kiocb plus a
 * one-segment iovec and delegate to ll_file_aio_write(); copy the
 * advanced position back to *ppos. */
1179 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1183 struct iovec *local_iov;
1184 struct kiocb *kiocb;
1189 env = cl_env_get(&refcheck);
1191 RETURN(PTR_ERR(env));
1193 local_iov = &vvp_env_info(env)->vti_local_iov;
1194 kiocb = &vvp_env_info(env)->vti_kiocb;
1195 local_iov->iov_base = (void __user *)buf;
1196 local_iov->iov_len = count;
1197 init_sync_kiocb(kiocb, file);
1198 kiocb->ki_pos = *ppos;
1199 kiocb->ki_left = count;
1201 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1202 *ppos = kiocb->ki_pos;
1204 cl_env_put(env, &refcheck);
1210 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: route page-cache content into @pipe through
 * the generic IO path with IO_SPLICE args. */
1212 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1213 struct pipe_inode_info *pipe, size_t count,
1217 struct vvp_io_args *args;
1222 env = cl_env_get(&refcheck);
1224 RETURN(PTR_ERR(env));
1226 args = vvp_env_args(env, IO_SPLICE);
1227 args->u.splice.via_pipe = pipe;
1228 args->u.splice.via_flags = flags;
1230 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1231 cl_env_put(env, &refcheck);
/*
 * Re-create a lost OST object for @inode: clone the current stripe md,
 * mark the obdo with OBD_FL_RECREATE_OBJS and the target @ost_idx
 * (passed via o_nlink), and issue obd_create() under the inode size
 * lock.  Used by the recreate-object/fid ioctls below.
 */
1235 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1238 struct obd_export *exp = ll_i2dtexp(inode);
1239 struct obd_trans_info oti = { 0 };
1240 struct obdo *oa = NULL;
1243 struct lov_stripe_md *lsm = NULL, *lsm2;
1250 lsm = ccc_inode_lsm_get(inode);
1251 if (!lsm_has_objects(lsm))
1252 GOTO(out, rc = -ENOENT);
1254 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1255 (lsm->lsm_stripe_count));
1257 OBD_ALLOC_LARGE(lsm2, lsm_size);
1259 GOTO(out, rc = -ENOMEM);
/* o_nlink is repurposed here to carry the target OST index */
1262 oa->o_nlink = ost_idx;
1263 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1264 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1265 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1266 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1267 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1268 memcpy(lsm2, lsm, lsm_size);
1269 ll_inode_size_lock(inode);
1270 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1271 ll_inode_size_unlock(inode);
1273 OBD_FREE_LARGE(lsm2, lsm_size);
1276 ccc_inode_lsm_put(inode, lsm);
1281 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1283 struct ll_recreate_obj ucreat;
1287 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1290 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1294 ostid_set_seq_mdt0(&oi);
1295 ostid_set_id(&oi, ucreat.lrc_id);
1296 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from user space, convert
 * it to an ost_id, derive the OST index from the FID sequence, and
 * recreate the objects.  Requires CAP_SYS_ADMIN.
 */
1299 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1306 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1309 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1312 fid_to_ostid(&fid, &oi);
	/* OST index is encoded in bits 16..31 of the FID sequence */
1313 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1314 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on @inode by replaying an IT_OPEN intent carrying
 * the user-supplied lov_user_md.  Fails (with a debug message) if the
 * file already has a stripe.  The open handle obtained as a side effect
 * is released again before returning.
 */
1317 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1318 int flags, struct lov_user_md *lum, int lum_size)
1320 struct lov_stripe_md *lsm = NULL;
1321 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1325 lsm = ccc_inode_lsm_get(inode);
1327 ccc_inode_lsm_put(inode, lsm);
1328 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1333 ll_inode_size_lock(inode);
1334 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1337 rc = oit.d.lustre.it_status;
1339 GOTO(out_req_free, rc);
	/* close the open handle created only to carry the setstripe EA */
1341 ll_release_openhandle(file->f_dentry, &oit);
1344 ll_inode_size_unlock(inode);
1345 ll_intent_release(&oit);
1346 ccc_inode_lsm_put(inode, lsm);
1349 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) of @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer of
 * *request (caller must keep the request until done with the EA) and
 * *lmm_size is its length.  The EA is byte-swapped to host endianness
 * on big-endian clients before being handed to userspace.
 */
1353 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1354 struct lov_mds_md **lmmp, int *lmm_size,
1355 struct ptlrpc_request **request)
1357 struct ll_sb_info *sbi = ll_i2sbi(inode);
1358 struct mdt_body *body;
1359 struct lov_mds_md *lmm = NULL;
1360 struct ptlrpc_request *req = NULL;
1361 struct md_op_data *op_data;
1364 rc = ll_get_max_mdsize(sbi, &lmmsize);
1368 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1369 strlen(filename), lmmsize,
1370 LUSTRE_OPC_ANY, NULL);
1371 if (IS_ERR(op_data))
1372 RETURN(PTR_ERR(op_data));
1374 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1375 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1376 ll_finish_md_op_data(op_data);
1378 CDEBUG(D_INFO, "md_getattr_name failed "
1379 "on %s: rc %d\n", filename, rc);
1383 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1384 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1386 lmmsize = body->eadatasize;
	/* no EA present (or zero-length): report -ENODATA */
1388 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1390 GOTO(out, rc = -ENODATA);
1393 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1394 LASSERT(lmm != NULL);
1396 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1397 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1398 GOTO(out, rc = -EPROTO);
1402 * This is coming from the MDS, so is probably in
1403 * little endian. We convert it to host endian before
1404 * passing it to userspace.
	/* LOV_MAGIC != le32(LOV_MAGIC) only on big-endian hosts, where the
	 * swab below is required */
1406 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1409 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1410 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
	/* if the function was called for a directory, avoid swabbing
	 * lsm objects that do not exist */
1413 /* if function called for directory - we should
1414 * avoid swab not existent lsm objects */
1415 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1416 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1417 if (S_ISREG(body->mode))
1418 lustre_swab_lov_user_md_objects(
1419 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1421 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1422 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1423 if (S_ISREG(body->mode))
1424 lustre_swab_lov_user_md_objects(
1425 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1432 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data
 * entry) from user space and apply it through
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * Requires CAP_SYS_ADMIN.
 */
1437 static int ll_lov_setea(struct inode *inode, struct file *file,
1440 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1441 struct lov_user_md *lump;
1442 int lum_size = sizeof(struct lov_user_md) +
1443 sizeof(struct lov_user_ost_data);
1447 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1450 OBD_ALLOC_LARGE(lump, lum_size);
1454 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1455 OBD_FREE_LARGE(lump, lum_size);
1459 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1461 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read the user's lov_user_md (v1 first,
 * re-read as v3 if the magic says so), set the stripe EA, then refresh
 * the layout and copy the resulting striping back to user space.
 */
1465 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1468 struct lov_user_md_v3 lumv3;
1469 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1470 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1471 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1473 int flags = FMODE_WRITE;
1476 /* first try with v1 which is smaller than v3 */
1477 lum_size = sizeof(struct lov_user_md_v1);
1478 if (copy_from_user(lumv1, lumv1p, lum_size))
1481 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1482 lum_size = sizeof(struct lov_user_md_v3);
1483 if (copy_from_user(&lumv3, lumv3p, lum_size))
1487 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1489 struct lov_stripe_md *lsm;
	/* tell the caller "0 stripes" first; real values follow below */
1492 put_user(0, &lumv1p->lmm_stripe_count);
1494 ll_layout_refresh(inode, &gen);
1495 lsm = ccc_inode_lsm_get(inode);
1496 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1497 0, lsm, (void *)arg);
1498 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: pass the inode's stripe metadata to the
 * LOV layer, which copies the striping info out to user space at @arg.
 */
1503 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1505 struct lov_stripe_md *lsm;
1509 lsm = ccc_inode_lsm_get(inode);
1511 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1513 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with group id @arg
 * on behalf of this file descriptor.  Only one group lock per fd is
 * allowed; lli_lock protects the fd_flags/fd_grouplock pair, and the
 * race with a concurrent locker is re-checked after the (blocking)
 * cl_get_grouplock() call.
 */
1517 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1519 struct ll_inode_info *lli = ll_i2info(inode);
1520 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1521 struct ccc_grouplock grouplock;
1525 if (ll_file_nolock(file))
1526 RETURN(-EOPNOTSUPP);
1528 spin_lock(&lli->lli_lock);
1529 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1530 CWARN("group lock already existed with gid %lu\n",
1531 fd->fd_grouplock.cg_gid);
1532 spin_unlock(&lli->lli_lock);
1535 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1536 spin_unlock(&lli->lli_lock);
	/* may block unless O_NONBLOCK was given at open time */
1538 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1539 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1543 spin_lock(&lli->lli_lock);
1544 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1545 spin_unlock(&lli->lli_lock);
1546 CERROR("another thread just won the race\n");
1547 cl_put_grouplock(&grouplock);
1551 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1552 fd->fd_grouplock = grouplock;
1553 spin_unlock(&lli->lli_lock);
1555 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock held by this file
 * descriptor.  Fails if no group lock is held or if @arg does not match
 * the held gid.  The fd state is cleared under lli_lock before the lock
 * itself is dropped outside the spinlock.
 */
1559 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1561 struct ll_inode_info *lli = ll_i2info(inode);
1562 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1563 struct ccc_grouplock grouplock;
1566 spin_lock(&lli->lli_lock);
1567 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1568 spin_unlock(&lli->lli_lock);
1569 CWARN("no group lock held\n");
1572 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1574 if (fd->fd_grouplock.cg_gid != arg) {
1575 CWARN("group lock %lu doesn't match current id %lu\n",
1576 arg, fd->fd_grouplock.cg_gid);
1577 spin_unlock(&lli->lli_lock);
	/* take a local copy so cl_put_grouplock() can run unlocked */
1581 grouplock = fd->fd_grouplock;
1582 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1583 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1584 spin_unlock(&lli->lli_lock);
1586 cl_put_grouplock(&grouplock);
1587 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1592 * Close inode open handle
1594 * \param dentry [in] dentry which contains the inode
1595 * \param it [in,out] intent which contains open info and result
1598 * \retval <0 failure
/*
 * Close the MDS open handle created by an intent (see the header
 * comment above).  No-op for the filesystem root or when the intent
 * carries no DISP_OPEN_OPEN disposition.  Also drops the extra open
 * request reference taken under DISP_ENQ_OPEN_REF.
 */
1600 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1602 struct inode *inode = dentry->d_inode;
1603 struct obd_client_handle *och;
1609 /* Root ? Do nothing. */
1610 if (dentry->d_inode->i_sb->s_root == dentry)
1613 /* No open handle to close? Move away */
1614 if (!it_disposition(it, DISP_OPEN_OPEN))
1617 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1619 OBD_ALLOC(och, sizeof(*och));
1621 GOTO(out, rc = -ENOMEM);
1623 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1624 ll_i2info(inode), it, och);
1626 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1629 /* this one is in place of ll_file_open */
1630 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1631 ptlrpc_req_finished(it->d.lustre.it_data);
1632 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1638 * Get size for inode for which FIEMAP mapping is requested.
1639 * Make the FIEMAP get_info call and returns the result.
/*
 * Perform the FIEMAP mapping for @inode: validate the requested flags,
 * flush dirty pages if FIEMAP_FLAG_SYNC is set, then forward the
 * request to the OSC/LOV layer via obd_get_info(KEY_FIEMAP).
 * @num_bytes is the total size of @fiemap including its extent array.
 */
1641 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1644 struct obd_export *exp = ll_i2dtexp(inode);
1645 struct lov_stripe_md *lsm = NULL;
1646 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1647 int vallen = num_bytes;
1651 /* Checks for fiemap flags */
1652 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
	/* report back which flags we do NOT support */
1653 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1657 /* Check for FIEMAP_FLAG_SYNC */
1658 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1659 rc = filemap_fdatawrite(inode->i_mapping);
1664 lsm = ccc_inode_lsm_get(inode);
1668 /* If the stripe_count > 1 and the application does not understand
1669 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1671 if (lsm->lsm_stripe_count > 1 &&
1672 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1673 GOTO(out, rc = -EOPNOTSUPP);
1675 fm_key.oa.o_oi = lsm->lsm_oi;
1676 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1678 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1679 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1680 /* If filesize is 0, then there would be no objects for mapping */
1681 if (fm_key.oa.o_size == 0) {
1682 fiemap->fm_mapped_extents = 0;
1686 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1688 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1691 CERROR("obd_get_info failed: rc = %d\n", rc);
1694 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path via the MDC.
 * Copies a getinfo_fid2path header in from user space to learn
 * gf_pathlen, allocates an output buffer of that size, runs the
 * iocontrol, and copies the result back.  Restricted to
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1698 int ll_fid2path(struct inode *inode, void *arg)
1700 struct obd_export *exp = ll_i2mdexp(inode);
1701 struct getinfo_fid2path *gfout, *gfin;
1705 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1706 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1709 /* Need to get the buflen */
1710 OBD_ALLOC_PTR(gfin);
1713 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1718 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1719 OBD_ALLOC(gfout, outsize);
1720 if (gfout == NULL) {
1724 memcpy(gfout, gfin, sizeof(*gfout));
1727 /* Call mdc_iocontrol */
1728 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1732 if (copy_to_user(arg, gfout, outsize))
1736 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user-supplied fm_extent_count, copy the request (and first extent,
 * used for continuation) in, run ll_do_fiemap(), and copy the header
 * plus mapped extents back out.
 */
1740 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1742 struct ll_user_fiemap *fiemap_s;
1743 size_t num_bytes, ret_bytes;
1744 unsigned int extent_count;
1747 /* Get the extent count so we can calculate the size of
1748 * required fiemap buffer */
1749 if (get_user(extent_count,
1750 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
	/* NOTE(review): extent_count * sizeof(extent) overflow check not
	 * visible in this view — confirm it exists in the full source */
1752 num_bytes = sizeof(*fiemap_s) + (extent_count *
1753 sizeof(struct ll_fiemap_extent));
1755 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1756 if (fiemap_s == NULL)
1759 /* get the fiemap value */
1760 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1762 GOTO(error, rc = -EFAULT);
1764 /* If fm_extent_count is non-zero, read the first extent since
1765 * it is used to calculate end_offset and device from previous
1768 if (copy_from_user(&fiemap_s->fm_extents[0],
1769 (char __user *)arg + sizeof(*fiemap_s),
1770 sizeof(struct ll_fiemap_extent)))
1771 GOTO(error, rc = -EFAULT);
1774 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1778 ret_bytes = sizeof(struct ll_user_fiemap);
1780 if (extent_count != 0)
1781 ret_bytes += (fiemap_s->fm_mapped_extents *
1782 sizeof(struct ll_fiemap_extent));
1784 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1788 OBD_FREE_LARGE(fiemap_s, num_bytes);
1793 * Read the data_version for inode.
1795 * This value is computed using stripe object version on OST.
1796 * Version is computed using server side locking.
1798 * @param extent_lock Take extent lock. Not needed if a process is already
1799 * holding the OST object group locks.
/*
 * Fetch the file's data version from the OSTs (see header comment
 * above).  A stripeless file yields version 0.  @extent_lock selects
 * server-side extent locking in ll_lsm_getattr(); pass 0 when the
 * caller already holds OST group locks.
 */
1801 int ll_data_version(struct inode *inode, __u64 *data_version,
1804 struct lov_stripe_md *lsm = NULL;
1805 struct ll_sb_info *sbi = ll_i2sbi(inode);
1806 struct obdo *obdo = NULL;
1810 /* If no stripe, we consider version is 0. */
1811 lsm = ccc_inode_lsm_get(inode);
1812 if (!lsm_has_objects(lsm)) {
1814 CDEBUG(D_INODE, "No object for inode\n");
1818 OBD_ALLOC_PTR(obdo);
1820 GOTO(out, rc = -ENOMEM);
1822 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1824 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1827 *data_version = obdo->o_data_version;
1833 ccc_inode_lsm_put(inode, lsm);
/* Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved mtime/atime (ia1/ia2) to restore afterwards, and whether each
 * side's data version must be verified before the swap. */
1837 struct ll_swap_stack {
1838 struct iattr ia1, ia2;
1840 struct inode *inode1, *inode2;
1841 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically exchange the layouts of the two
 * files on the MDT.  Both files must be regular, writable, and on the
 * same filesystem.  Inodes are ordered by FID to avoid deadlock,
 * optional group locks flush dirty cache, optional data-version checks
 * guard against concurrent modification, and mtime/atime can be
 * preserved across the swap on request.
 */
1844 static int ll_swap_layouts(struct file *file1, struct file *file2,
1845 struct lustre_swap_layouts *lsl)
1847 struct mdc_swap_layouts msl;
1848 struct md_op_data *op_data;
1851 struct ll_swap_stack *llss = NULL;
1854 OBD_ALLOC_PTR(llss);
1858 llss->inode1 = file1->f_dentry->d_inode;
1859 llss->inode2 = file2->f_dentry->d_inode;
1861 if (!S_ISREG(llss->inode2->i_mode))
1862 GOTO(free, rc = -EINVAL);
1864 if (inode_permission(llss->inode1, MAY_WRITE) ||
1865 inode_permission(llss->inode2, MAY_WRITE))
1866 GOTO(free, rc = -EPERM);
1868 if (llss->inode2->i_sb != llss->inode1->i_sb)
1869 GOTO(free, rc = -EXDEV);
1871 /* we use 2 bool because it is easier to swap than 2 bits */
1872 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1873 llss->check_dv1 = true;
1875 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1876 llss->check_dv2 = true;
1878 /* we cannot use lsl->sl_dvX directly because we may swap them */
1879 llss->dv1 = lsl->sl_dv1;
1880 llss->dv2 = lsl->sl_dv2;
1882 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1883 if (rc == 0) /* same file, done! */
	/* order the pair by FID so concurrent swaps cannot deadlock */
1886 if (rc < 0) { /* sequentialize it */
1887 swap(llss->inode1, llss->inode2);
1889 swap(llss->dv1, llss->dv2);
1890 swap(llss->check_dv1, llss->check_dv2);
1894 if (gid != 0) { /* application asks to flush dirty cache */
1895 rc = ll_get_grouplock(llss->inode1, file1, gid);
1899 rc = ll_get_grouplock(llss->inode2, file2, gid);
1901 ll_put_grouplock(llss->inode1, file1, gid);
1906 /* to be able to restore mtime and atime after swap
1907 * we need to first save them */
1909 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1910 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1911 llss->ia1.ia_atime = llss->inode1->i_atime;
1912 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1913 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1914 llss->ia2.ia_atime = llss->inode2->i_atime;
1915 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1918 /* ultimate check: before swapping the layouts, verify whether
1919 * dataversion has changed (if requested) */
1920 if (llss->check_dv1) {
1921 rc = ll_data_version(llss->inode1, &dv, 0);
1924 if (dv != llss->dv1)
1925 GOTO(putgl, rc = -EAGAIN);
1928 if (llss->check_dv2) {
1929 rc = ll_data_version(llss->inode2, &dv, 0);
1932 if (dv != llss->dv2)
1933 GOTO(putgl, rc = -EAGAIN);
1936 /* struct md_op_data is used to send the swap args to the mdt
1937 * only flags is missing, so we use struct mdc_swap_layouts
1938 * through the md_op_data->op_data */
1939 /* flags from user space have to be converted before they are sent to
1940 * the server; no flag is sent today, they are only used on the client */
1943 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1944 0, LUSTRE_OPC_ANY, &msl);
1945 if (IS_ERR(op_data))
1946 GOTO(free, rc = PTR_ERR(op_data));
1948 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1949 sizeof(*op_data), op_data, NULL);
1950 ll_finish_md_op_data(op_data);
1954 ll_put_grouplock(llss->inode2, file2, gid);
1955 ll_put_grouplock(llss->inode1, file1, gid);
1958 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1962 /* clear useless flags */
1963 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1964 llss->ia1.ia_valid &= ~ATTR_MTIME;
1965 llss->ia2.ia_valid &= ~ATTR_MTIME;
1968 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1969 llss->ia1.ia_valid &= ~ATTR_ATIME;
1970 llss->ia2.ia_valid &= ~ATTR_ATIME;
1973 /* update time if requested */
	/* layouts were exchanged, so ia2 (saved from inode2) is applied
	 * to inode1 and vice versa */
1975 if (llss->ia2.ia_valid != 0) {
1976 mutex_lock(&llss->inode1->i_mutex);
1977 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1978 mutex_unlock(&llss->inode1->i_mutex);
1981 if (llss->ia1.ia_valid != 0) {
1984 mutex_lock(&llss->inode2->i_mutex);
1985 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1986 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Top-level ioctl dispatcher for Lustre regular files.  Handles the
 * LL_IOC_* / FSFILT_IOC_* / OBD_IOC_* families; anything unrecognized
 * falls through to the dynamic ll_iocontrol_call() registry and finally
 * to the data export's obd_iocontrol().
 */
1998 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2000 struct inode *inode = file->f_dentry->d_inode;
2001 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2005 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2006 inode->i_generation, inode, cmd);
2007 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2009 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2010 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2014 case LL_IOC_GETFLAGS:
2015 /* Get the current value of the file flags */
2016 return put_user(fd->fd_flags, (int *)arg);
2017 case LL_IOC_SETFLAGS:
2018 case LL_IOC_CLRFLAGS:
2019 /* Set or clear specific file flags */
2020 /* XXX This probably needs checks to ensure the flags are
2021 * not abused, and to handle any flag side effects.
2023 if (get_user(flags, (int *) arg))
2026 if (cmd == LL_IOC_SETFLAGS) {
2027 if ((flags & LL_FILE_IGNORE_LOCK) &&
2028 !(file->f_flags & O_DIRECT)) {
2029 CERROR("%s: unable to disable locking on "
2030 "non-O_DIRECT file\n", current->comm);
2034 fd->fd_flags |= flags;
2036 fd->fd_flags &= ~flags;
2039 case LL_IOC_LOV_SETSTRIPE:
2040 RETURN(ll_lov_setstripe(inode, file, arg));
2041 case LL_IOC_LOV_SETEA:
2042 RETURN(ll_lov_setea(inode, file, arg));
2043 case LL_IOC_LOV_SWAP_LAYOUTS: {
2045 struct lustre_swap_layouts lsl;
2047 if (copy_from_user(&lsl, (char *)arg,
2048 sizeof(struct lustre_swap_layouts)))
	/* both fds must be writable for a layout swap */
2051 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2054 file2 = fget(lsl.sl_fd);
2059 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2060 rc = ll_swap_layouts(file, file2, &lsl);
2064 case LL_IOC_LOV_GETSTRIPE:
2065 RETURN(ll_lov_getstripe(inode, arg));
2066 case LL_IOC_RECREATE_OBJ:
2067 RETURN(ll_lov_recreate_obj(inode, arg));
2068 case LL_IOC_RECREATE_FID:
2069 RETURN(ll_lov_recreate_fid(inode, arg));
2070 case FSFILT_IOC_FIEMAP:
2071 RETURN(ll_ioctl_fiemap(inode, arg));
2072 case FSFILT_IOC_GETFLAGS:
2073 case FSFILT_IOC_SETFLAGS:
2074 RETURN(ll_iocontrol(inode, file, cmd, arg));
2075 case FSFILT_IOC_GETVERSION_OLD:
2076 case FSFILT_IOC_GETVERSION:
2077 RETURN(put_user(inode->i_generation, (int *)arg));
2078 case LL_IOC_GROUP_LOCK:
2079 RETURN(ll_get_grouplock(inode, file, arg));
2080 case LL_IOC_GROUP_UNLOCK:
2081 RETURN(ll_put_grouplock(inode, file, arg));
2082 case IOC_OBD_STATFS:
2083 RETURN(ll_obd_statfs(inode, (void *)arg));
2085 /* We need to special case any other ioctls we want to handle,
2086 * to send them to the MDS/OST as appropriate and to properly
2087 * network encode the arg field.
2088 case FSFILT_IOC_SETVERSION_OLD:
2089 case FSFILT_IOC_SETVERSION:
2091 case LL_IOC_FLUSHCTX:
2092 RETURN(ll_flush_ctx(inode));
2093 case LL_IOC_PATH2FID: {
2094 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2095 sizeof(struct lu_fid)))
2100 case OBD_IOC_FID2PATH:
2101 RETURN(ll_fid2path(inode, (void *)arg));
2102 case LL_IOC_DATA_VERSION: {
2103 struct ioc_data_version idv;
2106 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2109 rc = ll_data_version(inode, &idv.idv_version,
2110 !(idv.idv_flags & LL_DV_NOFLUSH));
2112 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2118 case LL_IOC_GET_MDTIDX: {
2121 mdtidx = ll_get_mdt_idx(inode);
2125 if (put_user((int)mdtidx, (int*)arg))
2130 case OBD_IOC_GETDTNAME:
2131 case OBD_IOC_GETMDNAME:
2132 RETURN(ll_get_obd_name(inode, cmd, arg));
2133 case LL_IOC_HSM_STATE_GET: {
2134 struct md_op_data *op_data;
2135 struct hsm_user_state *hus;
2142 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2143 LUSTRE_OPC_ANY, hus);
2144 if (IS_ERR(op_data)) {
2146 RETURN(PTR_ERR(op_data));
2149 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2152 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2155 ll_finish_md_op_data(op_data);
2159 case LL_IOC_HSM_STATE_SET: {
2160 struct md_op_data *op_data;
2161 struct hsm_state_set *hss;
2167 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2172 /* Non-root users are forbidden to set or clear flags which are
2173 * NOT defined in HSM_USER_MASK. */
2174 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2175 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2180 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2181 LUSTRE_OPC_ANY, hss);
2182 if (IS_ERR(op_data)) {
2184 RETURN(PTR_ERR(op_data));
2187 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2190 ll_finish_md_op_data(op_data);
2195 case LL_IOC_HSM_ACTION: {
2196 struct md_op_data *op_data;
2197 struct hsm_current_action *hca;
2204 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2205 LUSTRE_OPC_ANY, hca);
2206 if (IS_ERR(op_data)) {
2208 RETURN(PTR_ERR(op_data));
2211 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2214 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2217 ll_finish_md_op_data(op_data);
	/* unknown command: try the dynamically-registered handlers,
	 * then fall back to the data export's iocontrol */
2225 ll_iocontrol_call(inode, file, cmd, arg, &err))
2228 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2234 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Local copy of the kernel's llseek_execute(): validate @offset against
 * sign/maxsize rules and commit it to file->f_pos (resetting f_version
 * on change).  Compiled only when the kernel lacks
 * generic_file_llseek_size(). */
2235 static inline loff_t
2236 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2238 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2240 if (offset > maxsize)
2243 if (offset != file->f_pos) {
2244 file->f_pos = offset;
2245 file->f_version = 0;
/* Local copy of generic_file_llseek_size() for kernels without it:
 * handles SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against the given
 * @maxsize and @eof.  SEEK_CUR with offset 0 is a pure position query;
 * otherwise f_pos updates are serialized under i_mutex. */
2251 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2252 loff_t maxsize, loff_t eof)
2254 struct inode *inode = file->f_dentry->d_inode;
2262 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2263 * position-querying operation. Avoid rewriting the "same"
2264 * f_pos value back to the file because a concurrent read(),
2265 * write() or lseek() might have altered it
2270 * f_lock protects against read/modify/write race with other
2271 * SEEK_CURs. Note that parallel writes and reads behave
2274 mutex_lock(&inode->i_mutex);
2275 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2276 mutex_unlock(&inode->i_mutex);
2280 * In the generic case the entire file is data, so as long as
2281 * offset isn't at the end of the file then the offset is data.
2288 * There is a virtual hole at the end of the file, so as long as
2289 * offset isn't i_size or larger, return i_size.
2297 return llseek_execute(file, offset, maxsize);
/*
 * llseek() entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current, so glimpse it from the OSTs first; then delegate to
 * the generic size-aware llseek with Lustre's maxbytes limit.
 */
2301 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2303 struct inode *inode = file->f_dentry->d_inode;
2304 loff_t retval, eof = 0;
2307 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2308 (origin == SEEK_CUR) ? file->f_pos : 0);
2309 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2310 inode->i_ino, inode->i_generation, inode, retval, retval,
2312 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2314 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2315 retval = ll_glimpse_size(inode);
2318 eof = i_size_read(inode);
2321 retval = ll_generic_file_llseek_size(file, offset, origin,
2322 ll_file_maxbytes(inode), eof);
/*
 * flush() entry point (called on close(2) of each fd): surface any
 * async writeback errors recorded on the inode/clobject as -EIO, but
 * only once per fd — if this fd already reported a write failure the
 * error is not repeated.
 */
2326 int ll_flush(struct file *file, fl_owner_t id)
2328 struct inode *inode = file->f_dentry->d_inode;
2329 struct ll_inode_info *lli = ll_i2info(inode);
2330 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2333 LASSERT(!S_ISDIR(inode->i_mode));
2335 /* catch async errors that were recorded back when async writeback
2336 * failed for pages in this mapping. */
2337 rc = lli->lli_async_rc;
2338 lli->lli_async_rc = 0;
2339 err = lov_read_and_clear_async_rc(lli->lli_clob);
2343 /* The application has been told write failure already.
2344 * Do not report failure again. */
2345 if (fd->fd_write_failed)
2347 return rc ? -EIO : 0;
2351 * Called to make sure a portion of file has been written out.
2352 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2354 * Return how many pages have been written.
/*
 * Write out [start, end] of @inode through a CIT_FSYNC cl_io (see the
 * header comment above).  @mode selects local-only flush, full OST
 * sync, or discard; on success the number of pages written is returned.
 */
2356 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2357 enum cl_fsync_mode mode, int ignore_layout)
2359 struct cl_env_nest nest;
2362 struct obd_capa *capa = NULL;
2363 struct cl_fsync_io *fio;
2367 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2368 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2371 env = cl_env_nested_get(&nest);
2373 RETURN(PTR_ERR(env));
2375 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2377 io = ccc_env_thread_io(env);
2378 io->ci_obj = cl_i2info(inode)->lli_clob;
2379 io->ci_ignore_layout = ignore_layout;
2381 /* initialize parameters for sync */
2382 fio = &io->u.ci_fsync;
2383 fio->fi_capa = capa;
2384 fio->fi_start = start;
2386 fio->fi_fid = ll_inode2fid(inode);
2387 fio->fi_mode = mode;
2388 fio->fi_nr_written = 0;
2390 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2391 result = cl_io_loop(env, io);
2393 result = io->ci_result;
	/* success: report pages written rather than 0 */
2395 result = fio->fi_nr_written;
2396 cl_io_fini(env, io);
2397 cl_env_nested_put(&nest, env);
2405 * When dentry is provided (the 'else' case), *file->f_dentry may be
2406 * null and dentry must be used directly rather than pulled from
2407 * *file->f_dentry as is done otherwise.
/*
 * fsync() entry point; three prototypes depending on kernel version
 * (see comment above about dentry vs file->f_dentry).  Waits for
 * in-flight page I/O, surfaces recorded async write errors, syncs
 * metadata via md_sync(), and for datasync on regular files pushes data
 * to the OSTs with cl_sync_file_range(), updating fd_write_failed.
 */
2410 #ifdef HAVE_FILE_FSYNC_4ARGS
2411 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2413 struct dentry *dentry = file->f_dentry;
2414 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2415 int ll_fsync(struct file *file, int datasync)
2417 struct dentry *dentry = file->f_dentry;
2419 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2422 struct inode *inode = dentry->d_inode;
2423 struct ll_inode_info *lli = ll_i2info(inode);
2424 struct ptlrpc_request *req;
2425 struct obd_capa *oc;
2429 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2430 inode->i_generation, inode);
2431 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2433 #ifdef HAVE_FILE_FSYNC_4ARGS
2434 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2435 mutex_lock(&inode->i_mutex);
2437 /* fsync's caller has already called _fdata{sync,write}, we want
2438 * that IO to finish before calling the osc and mdc sync methods */
2439 rc = filemap_fdatawait(inode->i_mapping);
2442 /* catch async errors that were recorded back when async writeback
2443 * failed for pages in this mapping. */
2444 if (!S_ISDIR(inode->i_mode)) {
2445 err = lli->lli_async_rc;
2446 lli->lli_async_rc = 0;
2449 err = lov_read_and_clear_async_rc(lli->lli_clob);
2454 oc = ll_mdscapa_get(inode);
2455 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2461 ptlrpc_req_finished(req);
2463 if (datasync && S_ISREG(inode->i_mode)) {
2464 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2466 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2468 if (rc == 0 && err < 0)
	/* remember the outcome so ll_flush() doesn't double-report */
2471 fd->fd_write_failed = true;
2473 fd->fd_write_failed = false;
2476 #ifdef HAVE_FILE_FSYNC_4ARGS
2477 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock entry point: translate the VFS file_lock into an
 * LDLM_FLOCK enqueue on the MDS (LCK_PR/PW for read/write, LCK_NL for
 * unlock), then mirror the result into the local VFS lock lists.  If
 * the local step fails, the remote lock is rolled back with an NL
 * enqueue.
 */
2482 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2484 struct inode *inode = file->f_dentry->d_inode;
2485 struct ll_sb_info *sbi = ll_i2sbi(inode);
2486 struct ldlm_enqueue_info einfo = {
2487 .ei_type = LDLM_FLOCK,
2488 .ei_cb_cp = ldlm_flock_completion_ast,
2489 .ei_cbdata = file_lock,
2491 struct md_op_data *op_data;
2492 struct lustre_handle lockh = {0};
2493 ldlm_policy_data_t flock = {{0}};
2499 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2500 inode->i_ino, file_lock);
2502 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2504 if (file_lock->fl_flags & FL_FLOCK) {
2505 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2506 /* flocks are whole-file locks */
2507 flock.l_flock.end = OFFSET_MAX;
2508 /* For flocks owner is determined by the local file descriptor */
2509 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2510 } else if (file_lock->fl_flags & FL_POSIX) {
2511 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2512 flock.l_flock.start = file_lock->fl_start;
2513 flock.l_flock.end = file_lock->fl_end;
2517 flock.l_flock.pid = file_lock->fl_pid;
2519 /* Somewhat ugly workaround for svc lockd.
2520 * lockd installs custom fl_lmops->lm_compare_owner that checks
2521 * for the fl_owner to be the same (which it always is on local node
2522 * I guess between lockd processes) and then compares pid.
2523 * As such we assign pid to the owner field to make it all work,
2524 * conflict with normal locks is unlikely since pid space and
2525 * pointer space for current->files are not intersecting */
2526 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2527 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2529 switch (file_lock->fl_type) {
2531 einfo.ei_mode = LCK_PR;
2534 /* An unlock request may or may not have any relation to
2535 * existing locks so we may not be able to pass a lock handle
2536 * via a normal ldlm_lock_cancel() request. The request may even
2537 * unlock a byte range in the middle of an existing lock. In
2538 * order to process an unlock request we need all of the same
2539 * information that is given with a normal read or write record
2540 * lock request. To avoid creating another ldlm unlock (cancel)
2541 * message we'll treat a LCK_NL flock request as an unlock. */
2542 einfo.ei_mode = LCK_NL;
2545 einfo.ei_mode = LCK_PW;
2548 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2549 file_lock->fl_type);
2564 flags = LDLM_FL_BLOCK_NOWAIT;
2570 flags = LDLM_FL_TEST_LOCK;
2571 /* Save the old mode so that if the mode in the lock changes we
2572 * can decrement the appropriate reader or writer refcount. */
2573 file_lock->fl_type = einfo.ei_mode;
2576 CERROR("unknown fcntl lock command: %d\n", cmd);
2580 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2581 LUSTRE_OPC_ANY, NULL);
2582 if (IS_ERR(op_data))
2583 RETURN(PTR_ERR(op_data));
2585 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2586 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2587 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2589 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2590 op_data, &lockh, &flock, 0, NULL /* req */, flags);
	/* mirror the granted/released lock into the local VFS lists */
2592 if ((file_lock->fl_flags & FL_FLOCK) &&
2593 (rc == 0 || file_lock->fl_type == F_UNLCK))
2594 rc2 = flock_lock_file_wait(file, file_lock);
2595 if ((file_lock->fl_flags & FL_POSIX) &&
2596 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2597 !(flags & LDLM_FL_TEST_LOCK))
2598 rc2 = posix_lock_file_wait(file, file_lock);
2600 if (rc2 && file_lock->fl_type != F_UNLCK) {
	/* local bookkeeping failed: undo the server-side lock */
2601 einfo.ei_mode = LCK_NL;
2602 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2603 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2607 ll_finish_md_op_data(op_data);
2612 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2620 * test if some locks matching bits and l_req_mode are acquired
2621 * - bits can be in different locks
2622 * - if found clear the common lock bits in *bits
2623 * - the bits not found, are kept in *bits
2625 * \param bits [IN] searched lock bits [IN]
2626 * \param l_req_mode [IN] searched lock mode
2627 * \retval boolean, true iff all bits are found
/*
 * Test which of the requested inodebits are covered by locks already
 * held on @inode (see the header comment above): iterate the bits,
 * match each against the LDLM cache without taking references
 * (LDLM_FL_TEST_LOCK), and clear the bits that were found.
 */
2629 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2631 struct lustre_handle lockh;
2632 ldlm_policy_data_t policy;
2633 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2634 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2643 fid = &ll_i2info(inode)->lli_fid;
2644 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2645 ldlm_lockname[mode]);
2647 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2648 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2649 policy.l_inodebits.bits = *bits & (1 << i);
2650 if (policy.l_inodebits.bits == 0)
2653 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2654 &policy, mode, &lockh)) {
2655 struct ldlm_lock *lock;
2657 lock = ldlm_handle2lock(&lockh);
	/* clear every bit the matched lock covers, not just bit i */
2660 ~(lock->l_policy_data.l_inodebits.bits);
2661 LDLM_LOCK_PUT(lock);
2663 *bits &= ~policy.l_inodebits.bits;
/* Try to match an already-granted MDC inodebits lock covering `bits`
 * (any of CR/CW/PR/PW).  Unlike ll_have_md_lock() no LDLM_FL_TEST_LOCK is
 * used here, so on success the matched lock is referenced and its handle
 * returned in *lockh; the result of md_lock_match() is the granted mode
 * (0 on no match). */
2670 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2671 struct lustre_handle *lockh, __u64 flags)
2673 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2678 fid = &ll_i2info(inode)->lli_fid;
2679 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2681 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2682 fid, LDLM_IBITS, &policy,
2683 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Common tail for revalidation: translate the RPC result.  -ENOENT on a
 * non-regular, non-directory inode is forgiven (object already unlinked);
 * any other non-zero rc is logged as a revalidation error. */
2687 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2689 /* Already unlinked. Just update nlink and return success */
2690 if (rc == -ENOENT) {
2692 /* This path cannot be hit for regular files unless in
2693 * case of obscure races, so no need to validate
2695 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2697 } else if (rc != 0) {
2698 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2699 ll_get_fsname(inode->i_sb, NULL, 0),
2700 PFID(ll_inode2fid(inode)), rc);
/* Revalidate the dentry's inode attributes against the MDS.  Two paths:
 * with OBD_CONNECT_ATTRFID an intent getattr-by-fid is issued; otherwise,
 * if no suitable ibits lock is cached, a plain md_getattr RPC refreshes
 * the inode.  NOTE(review): several statements are elided in this excerpt;
 * error-path details cannot be fully confirmed from here. */
2706 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2709 struct inode *inode = dentry->d_inode;
2710 struct ptlrpc_request *req = NULL;
2711 struct obd_export *exp;
2715 LASSERT(inode != NULL);
2717 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2718 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2720 exp = ll_i2mdexp(inode);
2722 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2723 * But under CMD case, it caused some lock issues, should be fixed
2724 * with new CMD ibits lock. See bug 12718 */
2725 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2726 struct lookup_intent oit = { .it_op = IT_GETATTR };
2727 struct md_op_data *op_data;
/* a pure LOOKUP revalidation only needs the cheaper IT_LOOKUP intent */
2729 if (ibits == MDS_INODELOCK_LOOKUP)
2730 oit.it_op = IT_LOOKUP;
2732 /* Call getattr by fid, so do not provide name at all. */
2733 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2734 dentry->d_inode, NULL, 0, 0,
2735 LUSTRE_OPC_ANY, NULL);
2736 if (IS_ERR(op_data))
2737 RETURN(PTR_ERR(op_data));
2739 oit.it_create_mode |= M_CHECK_STALE;
2740 rc = md_intent_lock(exp, op_data, NULL, 0,
2741 /* we are not interested in name
2744 ll_md_blocking_ast, 0);
2745 ll_finish_md_op_data(op_data);
2746 oit.it_create_mode &= ~M_CHECK_STALE;
2748 rc = ll_inode_revalidate_fini(inode, rc);
2752 rc = ll_revalidate_it_finish(req, &oit, dentry);
2754 ll_intent_release(&oit);
2758 /* Unlinked? Unhash dentry, so it is not picked up later by
2759 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2760 here to preserve get_cwd functionality on 2.6.
2762 if (!dentry->d_inode->i_nlink)
2763 d_lustre_invalidate(dentry, 0);
2765 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID: skip the RPC entirely when a cached ibits lock covers us */
2766 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2767 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2768 obd_valid valid = OBD_MD_FLGETATTR;
2769 struct md_op_data *op_data;
/* regular files also need striping EA; size the reply buffer for it */
2772 if (S_ISREG(inode->i_mode)) {
2773 rc = ll_get_max_mdsize(sbi, &ealen);
2776 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2779 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2780 0, ealen, LUSTRE_OPC_ANY,
2782 if (IS_ERR(op_data))
2783 RETURN(PTR_ERR(op_data));
2785 op_data->op_valid = valid;
2786 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2787 * capa for this inode. Because we only keep capas of dirs
2789 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2790 ll_finish_md_op_data(op_data);
2792 rc = ll_inode_revalidate_fini(inode, rc);
2796 rc = ll_prep_inode(&inode, req, NULL, NULL);
2799 ptlrpc_req_finished(req);
/* Revalidate attributes, then refresh timestamps from the cached LVB for
 * non-regular objects, or glimpse the size from the OSTs for regular files. */
2803 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2806 struct inode *inode = dentry->d_inode;
2810 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2814 /* if object isn't regular file, don't validate size */
2815 if (!S_ISREG(inode->i_mode)) {
2816 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2817 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2818 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
/* regular file: ask the OSTs for up-to-date size/blocks */
2820 rc = ll_glimpse_size(inode);
/* getattr worker: revalidate UPDATE|LOOKUP ibits, bump the GETATTR stats
 * counter, then fill *stat from the (now fresh) in-core inode. */
2825 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2826 struct lookup_intent *it, struct kstat *stat)
2828 struct inode *inode = de->d_inode;
2829 struct ll_sb_info *sbi = ll_i2sbi(inode);
2830 struct ll_inode_info *lli = ll_i2info(inode);
2833 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2834 MDS_INODELOCK_LOOKUP);
2835 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2840 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits, built from the FID */
2841 if (ll_need_32bit_api(sbi))
2842 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2844 stat->ino = inode->i_ino;
2845 stat->mode = inode->i_mode;
2846 stat->nlink = inode->i_nlink;
2847 stat->uid = inode->i_uid;
2848 stat->gid = inode->i_gid;
2849 stat->rdev = inode->i_rdev;
2850 stat->atime = inode->i_atime;
2851 stat->mtime = inode->i_mtime;
2852 stat->ctime = inode->i_ctime;
2853 stat->blksize = 1 << inode->i_blkbits;
2855 stat->size = i_size_read(inode);
2856 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wrap ll_getattr_it() with an IT_GETATTR intent. */
2860 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2862 struct lookup_intent it = { .it_op = IT_GETATTR };
2864 return ll_getattr_it(mnt, de, &it, stat);
2867 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap handler: marshal fieinfo into a ll_user_fiemap, run
 * ll_do_fiemap(), and copy flags/extents back out. */
2868 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2869 __u64 start, __u64 len)
2873 struct ll_user_fiemap *fiemap;
2874 unsigned int extent_count = fieinfo->fi_extents_max;
/* buffer holds the header plus room for every requested extent */
2876 num_bytes = sizeof(*fiemap) + (extent_count *
2877 sizeof(struct ll_fiemap_extent));
2878 OBD_ALLOC_LARGE(fiemap, num_bytes);
2883 fiemap->fm_flags = fieinfo->fi_flags;
2884 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2885 fiemap->fm_start = start;
2886 fiemap->fm_length = len;
/* NOTE(review): only ONE extent is copied in here while extent_count
 * extents are copied back out below; presumably the first extent seeds
 * continuation state — confirm against ll_do_fiemap(), and confirm
 * fi_extents_start is non-NULL when fi_extents_max == 0. */
2887 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2888 sizeof(struct ll_fiemap_extent));
2890 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2892 fieinfo->fi_flags = fiemap->fm_flags;
2893 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2894 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2895 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2897 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL for this inode.
 * lli_lock protects lli_posix_acl while we take the reference. */
2902 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2904 struct ll_inode_info *lli = ll_i2info(inode);
2905 struct posix_acl *acl = NULL;
2908 spin_lock(&lli->lli_lock);
2909 /* VFS' acl_permission_check->check_acl will release the refcount */
2910 acl = posix_acl_dup(lli->lli_posix_acl);
2911 spin_unlock(&lli->lli_lock);
/* ACL check callback passed to ll_generic_permission().  The signature
 * varies with the kernel's generic_permission() prototype; without
 * CONFIG_FS_POSIX_ACL the function degenerates (body elided here). */
2916 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2918 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2919 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2921 ll_check_acl(struct inode *inode, int mask)
2924 # ifdef CONFIG_FS_POSIX_ACL
2925 struct posix_acl *acl;
/* under RCU walk we must not block fetching the ACL */
2929 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2930 if (flags & IPERM_FLAG_RCU)
2933 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2938 rc = posix_acl_permission(inode, acl, mask);
2939 posix_acl_release(acl);
2942 # else /* !CONFIG_FS_POSIX_ACL */
2944 # endif /* CONFIG_FS_POSIX_ACL */
2946 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* VFS ->permission handler (signature varies by kernel).  Revalidates the
 * root inode before checking, defers to the remote-permission path for
 * RMT_CLIENT mounts, and otherwise uses generic permission + ACLs. */
2948 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2949 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2951 # ifdef HAVE_INODE_PERMISION_2ARGS
2952 int ll_inode_permission(struct inode *inode, int mask)
2954 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* cannot block during RCU path walk; bail out so VFS retries in ref-walk */
2961 #ifdef MAY_NOT_BLOCK
2962 if (mask & MAY_NOT_BLOCK)
2964 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2965 if (flags & IPERM_FLAG_RCU)
2969 /* as root inode are NOT getting validated in lookup operation,
2970 * need to do it before permission check. */
2972 if (inode == inode->i_sb->s_root->d_inode) {
2973 struct lookup_intent it = { .it_op = IT_LOOKUP };
2975 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2976 MDS_INODELOCK_LOOKUP);
2981 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2982 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* remote client: permission is decided by the server side */
2984 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2985 return lustre_check_remote_perm(inode, mask);
2987 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2988 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored-I/O file_operations member names to the right llite
 * handlers: older kernels expose ->readv/->writev, newer ones
 * ->aio_read/->aio_write.  Used by the file_operations tables below. */
2993 #ifdef HAVE_FILE_READV
2994 #define READ_METHOD readv
2995 #define READ_FUNCTION ll_file_readv
2996 #define WRITE_METHOD writev
2997 #define WRITE_FUNCTION ll_file_writev
2999 #define READ_METHOD aio_read
3000 #define READ_FUNCTION ll_file_aio_read
3001 #define WRITE_METHOD aio_write
3002 #define WRITE_FUNCTION ll_file_aio_write
3005 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no ->flock/->lock methods, so the VFS handles
 * locking locally on this client only. */
3006 struct file_operations ll_file_operations = {
3007 .read = ll_file_read,
3008 .READ_METHOD = READ_FUNCTION,
3009 .write = ll_file_write,
3010 .WRITE_METHOD = WRITE_FUNCTION,
3011 .unlocked_ioctl = ll_file_ioctl,
3012 .open = ll_file_open,
3013 .release = ll_file_release,
3014 .mmap = ll_file_mmap,
3015 .llseek = ll_file_seek,
3016 .splice_read = ll_file_splice_read,
/* file_operations used with "-o flock": same as the default table plus
 * ->flock/->lock wired to ll_file_flock for cluster-coherent locking. */
3021 struct file_operations ll_file_operations_flock = {
3022 .read = ll_file_read,
3023 .READ_METHOD = READ_FUNCTION,
3024 .write = ll_file_write,
3025 .WRITE_METHOD = WRITE_FUNCTION,
3026 .unlocked_ioctl = ll_file_ioctl,
3027 .open = ll_file_open,
3028 .release = ll_file_release,
3029 .mmap = ll_file_mmap,
3030 .llseek = ll_file_seek,
3031 .splice_read = ll_file_splice_read,
3034 .flock = ll_file_flock,
3035 .lock = ll_file_flock
3038 /* These are for -o noflock - to return ENOSYS on flock calls */
3039 struct file_operations ll_file_operations_noflock = {
3040 .read = ll_file_read,
3041 .READ_METHOD = READ_FUNCTION,
3042 .write = ll_file_write,
3043 .WRITE_METHOD = WRITE_FUNCTION,
3044 .unlocked_ioctl = ll_file_ioctl,
3045 .open = ll_file_open,
3046 .release = ll_file_release,
3047 .mmap = ll_file_mmap,
3048 .llseek = ll_file_seek,
3049 .splice_read = ll_file_splice_read,
/* lock methods stubbed so applications get an explicit failure */
3052 .flock = ll_file_noflock,
3053 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, xattr, fiemap and ACL
 * entry points implemented by llite. */
3056 struct inode_operations ll_file_inode_operations = {
3057 .setattr = ll_setattr,
3058 .getattr = ll_getattr,
3059 .permission = ll_inode_permission,
3060 .setxattr = ll_setxattr,
3061 .getxattr = ll_getxattr,
3062 .listxattr = ll_listxattr,
3063 .removexattr = ll_removexattr,
3064 #ifdef HAVE_LINUX_FIEMAP_H
3065 .fiemap = ll_fiemap,
3067 #ifdef HAVE_IOP_GET_ACL
3068 .get_acl = ll_get_acl,
3072 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries guarded by an rw_semaphore. */
3073 static struct llioc_ctl_data {
3074 struct rw_semaphore ioc_sem;
3075 cfs_list_t ioc_head;
3077 __RWSEM_INITIALIZER(llioc.ioc_sem),
3078 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the array of ioctl cmds it serves.
 * iocd_cmd is a trailing variable-length array sized at registration. */
3083 cfs_list_t iocd_list;
3084 unsigned int iocd_size;
3085 llioc_callback_t iocd_cb;
3086 unsigned int iocd_count;
3087 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl callback for `count` command numbers.
 * Returns an opaque cookie (the allocation) used later to unregister,
 * or NULL on invalid arguments / allocation failure (elided paths). */
3090 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3093 struct llioc_data *in_data = NULL;
3096 if (cb == NULL || cmd == NULL ||
3097 count > LLIOC_MAX_CMD || count < 0)
3100 size = sizeof(*in_data) + count * sizeof(unsigned int);
3101 OBD_ALLOC(in_data, size);
3102 if (in_data == NULL)
3105 memset(in_data, 0, sizeof(*in_data));
3106 in_data->iocd_size = size;
3107 in_data->iocd_cb = cb;
3108 in_data->iocd_count = count;
3109 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
3111 down_write(&llioc.ioc_sem);
3112 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3113 up_write(&llioc.ioc_sem);
/* Remove and free the registration identified by `magic` (the cookie
 * returned by ll_iocontrol_register); warns if it is not found. */
3118 void ll_iocontrol_unregister(void *magic)
3120 struct llioc_data *tmp;
3125 down_write(&llioc.ioc_sem);
3126 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* found: unlink, drop the lock, then free outside the critical section */
3128 unsigned int size = tmp->iocd_size;
3130 cfs_list_del(&tmp->iocd_list);
3131 up_write(&llioc.ioc_sem);
3133 OBD_FREE(tmp, size);
3137 up_write(&llioc.ioc_sem);
3139 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Exported so other modules can register/unregister dynamic ioctls. */
3142 EXPORT_SYMBOL(ll_iocontrol_register);
3143 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch an unrecognized ioctl to the registered dynamic handlers.
 * Scans every registration for a matching cmd; stops when a callback
 * returns LLIOC_STOP, with its rc stored via *rcp (elided tail). */
3145 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3146 unsigned int cmd, unsigned long arg, int *rcp)
3148 enum llioc_iter ret = LLIOC_CONT;
3149 struct llioc_data *data;
3150 int rc = -EINVAL, i;
3152 down_read(&llioc.ioc_sem);
3153 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3154 for (i = 0; i < data->iocd_count; i++) {
3155 if (cmd != data->iocd_cmd[i])
3158 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3162 if (ret == LLIOC_STOP)
3165 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object stack via cl_conf_set().
 * For OBJECT_CONF_SET the associated layout lock is made matchable only
 * after the layout has been applied, so no one can see a stale layout. */
3172 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3174 struct ll_inode_info *lli = ll_i2info(inode);
3175 struct cl_env_nest nest;
/* nothing to configure if the cl_object was never created */
3180 if (lli->lli_clob == NULL)
3183 env = cl_env_nested_get(&nest);
3185 RETURN(PTR_ERR(env));
3187 result = cl_conf_set(env, lli->lli_clob, conf);
3188 cl_env_nested_put(&nest, env);
3190 if (conf->coc_opc == OBJECT_CONF_SET) {
3191 struct ldlm_lock *lock = conf->coc_lock;
3193 LASSERT(lock != NULL);
3194 LASSERT(ldlm_has_layout(lock));
3196 /* it can only be allowed to match after layout is
3197 * applied to inode otherwise false layout would be
3198 * seen. Applying layout should happen before dropping
3199 * the intent lock. */
3200 ldlm_lock_allow_match(lock);
3206 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/* On success the fetched LOV EA is copied into lock->l_lvb_data (replacing
 * any previous buffer) under the resource lock. */
3207 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3210 struct ll_sb_info *sbi = ll_i2sbi(inode);
3211 struct obd_capa *oc;
3212 struct ptlrpc_request *req;
3213 struct mdt_body *body;
3220 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3221 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3222 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated: nothing to fetch */
3224 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3227 /* if layout lock was granted right away, the layout is returned
3228 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3229 * blocked and then granted via completion ast, we have to fetch
3230 * layout here. Please note that we can't use the LVB buffer in
3231 * completion AST because it doesn't have a large enough buffer */
3232 oc = ll_mdscapa_get(inode);
3233 rc = ll_get_max_mdsize(sbi, &lmmsize);
3235 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3236 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3242 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3243 if (body == NULL || body->eadatasize > lmmsize)
3244 GOTO(out, rc = -EPROTO);
3246 lmmsize = body->eadatasize;
3247 if (lmmsize == 0) /* empty layout */
3250 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3252 GOTO(out, rc = -EFAULT);
3254 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3255 if (lvbdata == NULL)
3256 GOTO(out, rc = -ENOMEM);
/* install the new LVB buffer atomically w.r.t. the lock's resource */
3258 memcpy(lvbdata, lmm, lmmsize);
3259 lock_res_and_lock(lock);
3260 if (lock->l_lvb_data != NULL)
3261 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3263 lock->l_lvb_data = lvbdata;
3264 lock->l_lvb_len = lmmsize;
3265 unlock_res_and_lock(lock);
3270 ptlrpc_req_finished(req);
3275 * Apply the layout to the inode. Layout lock is held and will be released
/* Unpacks the layout from the lock's LVB (fetching it first if needed),
 * configures the cl_object with it, returns the layout generation in *gen,
 * and drops the caller's lock reference.  If reconfiguration raced with
 * in-flight I/O (-EBUSY) it waits for the I/O to drain. */
3278 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3279 struct inode *inode, __u32 *gen, bool reconf)
3281 struct ll_inode_info *lli = ll_i2info(inode);
3282 struct ll_sb_info *sbi = ll_i2sbi(inode);
3283 struct ldlm_lock *lock;
3284 struct lustre_md md = { NULL };
3285 struct cl_object_conf conf;
3288 bool wait_layout = false;
3291 LASSERT(lustre_handle_is_used(lockh));
3293 lock = ldlm_handle2lock(lockh);
3294 LASSERT(lock != NULL);
3295 LASSERT(ldlm_has_layout(lock));
3297 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3298 inode, PFID(&lli->lli_fid), reconf);
3300 /* in case this is a caching lock and reinstate with new inode */
3301 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3303 lock_res_and_lock(lock);
3304 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3305 unlock_res_and_lock(lock);
3306 /* checking lvb_ready is racy but this is okay. The worst case is
3307 * that multi processes may configure the file on the same time. */
3308 if (lvb_ready || !reconf) {
3311 /* layout_gen must be valid if layout lock is not
3312 * cancelled and stripe has already set */
3313 *gen = lli->lli_layout_gen;
3319 rc = ll_layout_fetch(inode, lock);
3323 /* for layout lock, lmm is returned in lock's lvb.
3324 * lvb_data is immutable if the lock is held so it's safe to access it
3325 * without res lock. See the description in ldlm_lock_decref_internal()
3326 * for the condition to free lvb_data of layout lock */
3327 if (lock->l_lvb_data != NULL) {
3328 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3329 lock->l_lvb_data, lock->l_lvb_len);
/* no stripe data: report the "empty layout" generation */
3331 *gen = LL_LAYOUT_GEN_EMPTY;
3333 *gen = md.lsm->lsm_layout_gen;
3336 CERROR("%s: file "DFID" unpackmd error: %d\n",
3337 ll_get_fsname(inode->i_sb, NULL, 0),
3338 PFID(&lli->lli_fid), rc);
3344 /* set layout to file. Unlikely this will fail as old layout was
3345 * surely eliminated */
3346 memset(&conf, 0, sizeof conf);
3347 conf.coc_opc = OBJECT_CONF_SET;
3348 conf.coc_inode = inode;
3349 conf.coc_lock = lock;
3350 conf.u.coc_md = &md;
3351 rc = ll_layout_conf(inode, &conf);
3354 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3356 /* refresh layout failed, need to wait */
3357 wait_layout = rc == -EBUSY;
3361 LDLM_LOCK_PUT(lock);
3362 ldlm_lock_decref(lockh, mode);
3364 /* wait for IO to complete if it's still being used. */
3366 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3367 ll_get_fsname(inode->i_sb, NULL, 0),
3368 inode, PFID(&lli->lli_fid));
3370 memset(&conf, 0, sizeof conf);
3371 conf.coc_opc = OBJECT_CONF_WAIT;
3372 conf.coc_inode = inode;
3373 rc = ll_layout_conf(inode, &conf);
3377 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3378 PFID(&lli->lli_fid), rc);
3384 * This function checks if there exists a LAYOUT lock on the client side,
3385 * or enqueues it if it doesn't have one in cache.
3387 * This function will not hold layout lock so it may be revoked any time after
3388 * this function returns. Any operations that depend on the layout should be redone
3391 * This function should be called before lov_io_init() to get an uptodate
3392 * layout version, the caller should save the version number and after IO
3393 * is finished, this function should be called again to verify that layout
3394 * is not changed during IO time.
3396 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3398 struct ll_inode_info *lli = ll_i2info(inode);
3399 struct ll_sb_info *sbi = ll_i2sbi(inode);
3400 struct md_op_data *op_data;
3401 struct lookup_intent it;
3402 struct lustre_handle lockh;
3404 struct ldlm_enqueue_info einfo = {
3405 .ei_type = LDLM_IBITS,
3407 .ei_cb_bl = ll_md_blocking_ast,
3408 .ei_cb_cp = ldlm_completion_ast,
3413 *gen = lli->lli_layout_gen;
3414 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3418 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3419 LASSERT(S_ISREG(inode->i_mode));
3421 /* mostly layout lock is caching on the local side, so try to match
3422 * it before grabbing layout lock mutex. */
3423 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3424 if (mode != 0) { /* hit cached lock */
3425 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3429 /* better hold lli_layout_mutex to try again otherwise
3430 * it will have starvation problem. */
3433 /* take layout lock mutex to enqueue layout lock exclusively. */
3434 mutex_lock(&lli->lli_layout_mutex);
3437 /* try again. Maybe somebody else has done this. */
3438 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3439 if (mode != 0) { /* hit cached lock */
3440 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3444 mutex_unlock(&lli->lli_layout_mutex);
3448 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3449 0, 0, LUSTRE_OPC_ANY, NULL);
3450 if (IS_ERR(op_data)) {
3451 mutex_unlock(&lli->lli_layout_mutex);
3452 RETURN(PTR_ERR(op_data));
3455 /* have to enqueue one */
3456 memset(&it, 0, sizeof(it));
3457 it.it_op = IT_LAYOUT;
3458 lockh.cookie = 0ULL;
3460 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3461 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3462 PFID(&lli->lli_fid));
3464 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3466 if (it.d.lustre.it_data != NULL)
3467 ptlrpc_req_finished(it.d.lustre.it_data);
3468 it.d.lustre.it_data = NULL;
3470 ll_finish_md_op_data(op_data);
3472 mode = it.d.lustre.it_lock_mode;
3473 it.d.lustre.it_lock_mode = 0;
3474 ll_intent_drop_lock(&it);
3477 /* set lock data in case this is a new lock */
3478 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3479 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3483 mutex_unlock(&lli->lli_layout_mutex);