4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * ll_file_data_get(): allocate a per-open file-private structure from the
 * ll_file_data_slab cache and reset its write-failure flag.
 * NOTE(review): this extraction has dropped lines (opening brace, the NULL
 * check after allocation, RETURN) -- verify against the complete source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
/* Slab allocation with I/O-safe GFP flags; fd may be NULL on failure,
 * presumably checked on one of the missing lines. */
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Start each open with a clean write-error state. */
61 fd->fd_write_failed = false;
/* ll_file_data_put(): return a ll_file_data to the ll_file_data_slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * ll_pack_inode2opdata(): copy the inode's current attributes (mode, times,
 * size, blocks, flags), its I/O epoch, the open handle @fh and the MDS
 * capability into @op_data for transmission to the MDS.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-specific ll_iattr wrapper, hence the
 * cast; inode flags are translated to their on-wire (ext) representation. */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* If cached data was modified, bias the close so the MDS knows. */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
94 * Closes the IO epoch and packs all the attributes into @op_data for
 * the subsequent md_close() RPC.
 * NOTE(review): the body of the FMODE_WRITE check at "106" is missing from
 * this extraction (it presumably skips the epoch-close path for read-only
 * handles) -- confirm against the complete source.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
/* Always send mode and all three timestamps on close. */
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client also
 * supplies size and blocks directly. */
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * ll_close_inode_openhandle(): release an MDS open handle: prepare the close
 * attributes, send md_close(), perform a Size-on-MDS update if the MDS asked
 * for one, destroy orphan objects, and clear replay data for @och.
 * NOTE(review): several lines (braces, ENTRY/RETURN, some if-bodies and
 * GOTO targets) are missing from this extraction -- the control flow below
 * is partial; verify against the complete source.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
/* Remember whether this close also closes the I/O epoch. */
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
/* Destroy OST objects referenced by the close reply (orphan cleanup). */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* If SOM is enabled and the epoch was not closed here, queue the inode for
 * a later DONE_WRITING pass. */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * ll_md_real_close(): drop the inode's cached MDS open handle for the open
 * mode in @flags (write/exec/read) if no other users remain, then close it
 * on the MDS via ll_close_inode_openhandle().
 * NOTE(review): lines detaching *och_p under lli_och_mutex appear to be
 * missing from this extraction -- verify against the complete source.
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* Select the handle slot and use count matching the open mode. */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * ll_md_close(): per-file ->release work against the MDS: drop any group
 * lock, decrement the open count for this file's mode, and if no OPEN DLM
 * lock covers the handle, do the real close RPC.  Finally free the
 * file-private data and close the capability.
 * NOTE(review): this extraction is missing lines (braces, the lockmode
 * setup, md_lock_match() trailing arguments) -- control flow is partial.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only check for a matching granted lock, don't take a ref. */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
/* Drop the use count matching the mode this fd was opened with. */
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock -> must really close the handle on the MDS. */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
 *
 * VFS ->release handler: tear down remote-ACL state on the root inode,
 * stop any statahead this fd started, flush async write errors into the
 * inode, and finish with ll_md_close().
 * NOTE(review): lines are missing from this extraction (braces, early
 * RETURNs, e.g. the root-dentry early exit) -- flow below is partial.
 */
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
/* Remote-client ACL bookkeeping applies only to the filesystem root. */
313 #ifdef CONFIG_FS_POSIX_ACL
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root directory never has an MDS open handle to close. */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
/* Fold any asynchronous write error into lli_async_rc before closing. */
345 if (!S_ISDIR(inode->i_mode)) {
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/*
 * ll_intent_file_open(): perform an intent-based open RPC to the MDS for
 * @file (optionally carrying striping info @lmm/@lmmsize), fill the inode
 * from the reply, and attach lock data if a lock was granted.
 * NOTE(review): lines are missing from this extraction (braces, error
 * checks after md_intent_lock, GOTO targets) -- flow below is partial.
 */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 const char *name = file->f_dentry->d_name.name;
364 const int len = file->f_dentry->d_name.len;
365 struct md_op_data *op_data;
366 struct ptlrpc_request *req;
367 __u32 opc = LUSTRE_OPC_ANY;
374 /* Usually we come here only for NFSD, and we want open lock.
375 But we can also get here with pre 2.6.15 patchless kernels, and in
376 that case that lock is also ok */
377 /* We can also get here if there was cached open handle in revalidate_it
378 * but it disappeared while we were getting from there to ll_file_open.
379 * But this means this file was closed and immediately opened which
380 * makes a good candidate for using OPEN lock */
381 /* If lmmsize & lmm are not 0, we are just setting stripe info
382 * parameters. No need for the open lock */
383 if (lmm == NULL && lmmsize == 0) {
384 itp->it_flags |= MDS_OPEN_LOCK;
385 if (itp->it_flags & FMODE_WRITE)
386 opc = LUSTRE_OPC_CREATE;
389 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
390 file->f_dentry->d_inode, name, len,
393 RETURN(PTR_ERR(op_data));
/* The target inode is already known, so open by FID. */
395 itp->it_flags |= MDS_OPEN_BY_FID;
396 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
397 0 /*unused */, &req, ll_md_blocking_ast, 0);
398 ll_finish_md_op_data(op_data);
400 /* reason for keep own exit path - don't flood log
401 * with messages with -ESTALE errors.
403 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
404 it_open_error(DISP_OPEN_OPEN, itp))
406 ll_release_openhandle(file->f_dentry, itp);
410 if (it_disposition(itp, DISP_LOOKUP_NEG))
411 GOTO(out, rc = -ENOENT);
413 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
414 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
415 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the intent reply; attach DLM lock data if the
 * open also granted a lock. */
419 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
420 if (!rc && itp->d.lustre.it_lock_mode)
421 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
425 ptlrpc_req_finished(itp->d.lustre.it_data);
426 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
427 ll_intent_drop_lock(itp);
/*
433 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
434 * not believe attributes if a few ioepoch holders exist. Attributes for
435 * previous ioepoch if new one is opened are also skipped by MDS.
 */
437 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
439 if (ioepoch && lli->lli_ioepoch != ioepoch) {
440 lli->lli_ioepoch = ioepoch;
441 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
442 ioepoch, PFID(&lli->lli_fid));
/*
 * ll_och_fill(): initialize an obd_client_handle from an intent-open reply:
 * copy the MDS file handle, FID and open flags, record the I/O epoch, and
 * register the open for replay.  Returns md_set_open_replay_data() result.
 */
446 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
447 struct lookup_intent *it, struct obd_client_handle *och)
449 struct ptlrpc_request *req = it->d.lustre.it_data;
450 struct mdt_body *body;
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 LASSERT(body != NULL); /* reply already checked out */
457 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
458 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
459 och->och_fid = lli->lli_fid;
460 och->och_flags = it->it_flags;
461 ll_ioepoch_open(lli, body->ioepoch);
/* Register this open so it can be replayed after MDS recovery. */
463 return md_set_open_replay_data(md_exp, och, req);
/*
 * ll_local_open(): finish an open locally: if @och is given, fill it from
 * the intent reply; then install @fd as the file's private data, initialize
 * readahead state, and remember the open mode.
 * NOTE(review): some lines (braces, error handling after ll_och_fill) are
 * missing from this extraction.
 */
466 int ll_local_open(struct file *file, struct lookup_intent *it,
467 struct ll_file_data *fd, struct obd_client_handle *och)
469 struct inode *inode = file->f_dentry->d_inode;
470 struct ll_inode_info *lli = ll_i2info(inode);
/* The fd must not already be installed on this file. */
473 LASSERT(!LUSTRE_FPRIVATE(file));
478 struct ptlrpc_request *req = it->d.lustre.it_data;
479 struct mdt_body *body;
482 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
486 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
487 if ((it->it_flags & FMODE_WRITE) &&
488 (body->valid & OBD_MD_FLSIZE))
489 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
490 lli->lli_ioepoch, PFID(&lli->lli_fid));
493 LUSTRE_FPRIVATE(file) = fd;
494 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the mode this fd was opened with for ll_md_close(). */
495 fd->fd_omode = it->it_flags;
499 /* Open a file, and (for the very first open) create objects on the OSTs at
500 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
501 * creation or open until ll_lov_setstripe() ioctl is called.
503 * If we already have the stripe MD locally then we don't request it in
504 * md_open(), by passing a lmm_size = 0.
506 * It is up to the application to ensure no other processes open this file
507 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
508 * used. We might be able to avoid races of that sort by getting lli_open_sem
509 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
510 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * VFS ->open handler.  Reuses a cached MDS open handle for the file's open
 * mode when one exists; otherwise performs an intent open RPC and caches
 * the resulting handle.
 * NOTE(review): many lines (braces, GOTO targets, retry loop, cleanup
 * labels) are missing from this extraction -- flow below is partial.
 */
512 int ll_file_open(struct inode *inode, struct file *file)
514 struct ll_inode_info *lli = ll_i2info(inode);
515 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
516 .it_flags = file->f_flags };
517 struct obd_client_handle **och_p = NULL;
518 __u64 *och_usecount = NULL;
519 struct ll_file_data *fd;
520 int rc = 0, opendir_set = 0;
523 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
524 inode->i_generation, inode, file->f_flags);
526 it = file->private_data; /* XXX: compat macro */
527 file->private_data = NULL; /* prevent ll_local_open assertion */
529 fd = ll_file_data_get();
531 GOTO(out_openerr, rc = -ENOMEM);
/* For directories, try to become the owner of the statahead thread. */
534 if (S_ISDIR(inode->i_mode)) {
535 spin_lock(&lli->lli_sa_lock);
536 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
537 lli->lli_opendir_pid == 0) {
538 lli->lli_opendir_key = fd;
539 lli->lli_opendir_pid = cfs_curproc_pid();
542 spin_unlock(&lli->lli_sa_lock);
/* Opening the root needs no MDS open handle. */
545 if (inode->i_sb->s_root == file->f_dentry) {
546 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent from f_flags. */
550 if (!it || !it->d.lustre.it_disposition) {
551 /* Convert f_flags into access mode. We cannot use file->f_mode,
552 * because everything but O_ACCMODE mask was stripped from
554 if ((oit.it_flags + 1) & O_ACCMODE)
556 if (file->f_flags & O_TRUNC)
557 oit.it_flags |= FMODE_WRITE;
559 /* kernel only call f_op->open in dentry_open. filp_open calls
560 * dentry_open after call to open_namei that checks permissions.
561 * Only nfsd_open call dentry_open directly without checking
562 * permissions and because of that this code below is safe. */
563 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
564 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
566 /* We do not want O_EXCL here, presumably we opened the file
567 * already? XXX - NFS implications? */
568 oit.it_flags &= ~O_EXCL;
570 /* bug20584, if "it_flags" contains O_CREAT, the file will be
571 * created if necessary, then "IT_CREAT" should be set to keep
572 * consistent with it */
573 if (oit.it_flags & O_CREAT)
574 oit.it_op |= IT_CREAT;
580 /* Let's see if we have file open on MDS already. */
581 if (it->it_flags & FMODE_WRITE) {
582 och_p = &lli->lli_mds_write_och;
583 och_usecount = &lli->lli_open_fd_write_count;
584 } else if (it->it_flags & FMODE_EXEC) {
585 och_p = &lli->lli_mds_exec_och;
586 och_usecount = &lli->lli_open_fd_exec_count;
588 och_p = &lli->lli_mds_read_och;
589 och_usecount = &lli->lli_open_fd_read_count;
592 mutex_lock(&lli->lli_och_mutex);
593 if (*och_p) { /* Open handle is present */
594 if (it_disposition(it, DISP_OPEN_OPEN)) {
595 /* Well, there's extra open request that we do not need,
596 let's close it somehow. This will decref request. */
597 rc = it_open_error(DISP_OPEN_OPEN, it);
599 mutex_unlock(&lli->lli_och_mutex);
600 GOTO(out_openerr, rc);
603 ll_release_openhandle(file->f_dentry, it);
/* Cached handle exists: open locally against it (no new och needed). */
607 rc = ll_local_open(file, it, fd, NULL);
610 mutex_unlock(&lli->lli_och_mutex);
611 GOTO(out_openerr, rc);
614 LASSERT(*och_usecount == 0);
615 if (!it->d.lustre.it_disposition) {
616 /* We cannot just request lock handle now, new ELC code
617 means that one of other OPEN locks for this file
618 could be cancelled, and since blocking ast handler
619 would attempt to grab och_mutex as well, that would
620 result in a deadlock */
621 mutex_unlock(&lli->lli_och_mutex);
622 it->it_create_mode |= M_CHECK_STALE;
623 rc = ll_intent_file_open(file, NULL, 0, it);
624 it->it_create_mode &= ~M_CHECK_STALE;
626 GOTO(out_openerr, rc);
630 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
632 GOTO(out_och_free, rc = -ENOMEM);
636 /* md_intent_lock() didn't get a request ref if there was an
637 * open error, so don't do cleanup on the request here
639 /* XXX (green): Should not we bail out on any error here, not
640 * just open error? */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 GOTO(out_och_free, rc);
645 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
647 rc = ll_local_open(file, it, fd, *och_p);
649 GOTO(out_och_free, rc);
651 mutex_unlock(&lli->lli_och_mutex);
654 /* Must do this outside lli_och_mutex lock to prevent deadlock where
655 different kind of OPEN lock for this same inode gets cancelled
656 by ldlm_cancel_lru */
657 if (!S_ISREG(inode->i_mode))
658 GOTO(out_och_free, rc);
/* Regular file without striping: creation may be deliberately delayed. */
662 if (!lli->lli_has_smd) {
663 if (file->f_flags & O_LOV_DELAY_CREATE ||
664 !(file->f_mode & FMODE_WRITE)) {
665 CDEBUG(D_INODE, "object creation was delayed\n");
666 GOTO(out_och_free, rc);
669 file->f_flags &= ~O_LOV_DELAY_CREATE;
670 GOTO(out_och_free, rc);
/* Error cleanup: free the allocated och and reset the slot. */
674 if (och_p && *och_p) {
675 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676 *och_p = NULL; /* OBD_FREE writes some magic there */
679 mutex_unlock(&lli->lli_och_mutex);
682 if (opendir_set != 0)
683 ll_stop_statahead(inode, lli->lli_opendir_key);
685 ll_file_data_put(fd);
687 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the request reference the intent may still hold. */
690 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
691 ptlrpc_req_finished(it->d.lustre.it_data);
692 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
698 /* Fills the obdo with the attributes for the lsm
 *
 * Issues an async OST getattr for @lsm via a ptlrpc set and waits for it.
 * If @sync, requests the server-side lock (OBD_FL_SRVLOCK).  On success
 * only the OST-authoritative fields are kept valid in the obdo.
 * NOTE(review): lines are missing from this extraction (oinfo.oi_oa/oi_md
 * setup, error-path returns) -- verify against the complete source.
 */
699 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
700 struct obd_capa *capa, struct obdo *obdo,
701 __u64 ioepoch, int sync)
703 struct ptlrpc_request_set *set;
704 struct obd_info oinfo = { { { 0 } } };
709 LASSERT(lsm != NULL);
713 oinfo.oi_oa->o_oi = lsm->lsm_oi;
714 oinfo.oi_oa->o_mode = S_IFREG;
715 oinfo.oi_oa->o_ioepoch = ioepoch;
716 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
717 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
718 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
719 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
720 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
721 OBD_MD_FLDATAVERSION;
722 oinfo.oi_capa = capa;
/* sync mode: ask the OST to take the lock server-side. */
724 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
725 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
728 set = ptlrpc_prep_set();
730 CERROR("can't allocate ptlrpc set\n");
733 rc = obd_getattr_async(exp, &oinfo, set);
735 rc = ptlrpc_set_wait(set);
736 ptlrpc_set_destroy(set);
/* Keep only fields the OSTs are authoritative for. */
739 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
740 OBD_MD_FLATIME | OBD_MD_FLMTIME |
741 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
742 OBD_MD_FLDATAVERSION);
/*
747 * Performs the getattr on the inode and updates its fields.
748 * If @sync != 0, perform the getattr under the server-side lock.
 */
750 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
751 __u64 ioepoch, int sync)
753 struct obd_capa *capa = ll_mdscapa_get(inode);
754 struct lov_stripe_md *lsm;
758 lsm = ccc_inode_lsm_get(inode);
759 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
760 capa, obdo, ioepoch, sync);
/* On success, fold the OST attributes back into the VFS inode. */
763 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
765 obdo_refresh_inode(inode, obdo, obdo->o_valid);
766 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
767 " blksize %lu\n", POSTID(oi), i_size_read(inode),
768 (unsigned long long)inode->i_blocks,
769 (unsigned long)ll_inode_blksize(inode));
771 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_merge_lvb(): merge MDS-provided timestamps cached in lli_lvb with the
 * attributes obtained from the cl_object (OSTs), keeping the most recent of
 * each, then update the inode's size, blocks and timestamps under the
 * inode size lock.
 */
775 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct cl_object *obj = lli->lli_clob;
779 struct cl_attr *attr = ccc_env_thread_attr(env);
785 ll_inode_size_lock(inode);
786 /* merge timestamps the most recently obtained from mds with
787 timestamps obtained from osts */
788 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
789 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
790 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
791 inode_init_lvb(inode, &lvb);
793 cl_object_attr_lock(obj);
794 rc = cl_object_attr_get(env, obj, attr);
795 cl_object_attr_unlock(obj);
/* Take the newer of each timestamp (MDS copy vs OST copy). */
798 if (lvb.lvb_atime < attr->cat_atime)
799 lvb.lvb_atime = attr->cat_atime;
800 if (lvb.lvb_ctime < attr->cat_ctime)
801 lvb.lvb_ctime = attr->cat_ctime;
802 if (lvb.lvb_mtime < attr->cat_mtime)
803 lvb.lvb_mtime = attr->cat_mtime;
805 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
806 PFID(&lli->lli_fid), attr->cat_size);
807 cl_isize_write_nolock(inode, attr->cat_size);
809 inode->i_blocks = attr->cat_blocks;
811 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
812 LTIME_S(inode->i_atime) = lvb.lvb_atime;
813 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
815 ll_inode_size_unlock(inode);
/*
 * ll_glimpse_ioctl(): fetch current OST attributes for @lsm without taking
 * a client lock and copy size/blocks/times into the user-visible stat.
 */
820 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
823 struct obdo obdo = { 0 };
826 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
828 st->st_size = obdo.o_size;
829 st->st_blocks = obdo.o_blocks;
830 st->st_mtime = obdo.o_mtime;
831 st->st_atime = obdo.o_atime;
832 st->st_ctime = obdo.o_ctime;
/*
 * ll_io_init(): initialize a cl_io from the file's open flags: non-blocking
 * and (for writes) append/sync behaviour, the target cl_object, and the
 * lock-request policy (never for nolock files, mandatory for O_APPEND).
 */
837 void ll_io_init(struct cl_io *io, const struct file *file, int write)
839 struct inode *inode = file->f_dentry->d_inode;
841 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
843 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
844 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
845 file->f_flags & O_DIRECT ||
848 io->ci_obj = ll_i2info(inode)->lli_clob;
849 io->ci_lockreq = CILR_MAYBE;
850 if (ll_file_nolock(file)) {
851 io->ci_lockreq = CILR_NEVER;
852 io->ci_no_srvlock = 1;
/* Appends must be serialized, so the lock becomes mandatory. */
853 } else if (file->f_flags & O_APPEND) {
854 io->ci_lockreq = CILR_MANDATORY;
/*
 * ll_file_io_generic(): common engine behind all read/write entry points:
 * set up a cl_io of type @iot for @count bytes at *ppos, dispatch on the
 * I/O subtype (normal iovec, sendfile, splice), run the cl_io loop with
 * the appropriate trunc_sem/write_mutex protection, then restart short
 * and tally statistics.
 * NOTE(review): lines are missing (case labels, restart loop, final
 * RETURN) -- flow below is partial; verify against the complete source.
 */
859 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
860 struct file *file, enum cl_io_type iot,
861 loff_t *ppos, size_t count)
863 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
864 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
870 io = ccc_env_thread_io(env);
871 ll_io_init(io, file, iot == CIT_WRITE);
873 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
874 struct vvp_io *vio = vvp_env_io(env);
875 struct ccc_io *cio = ccc_env_io(env);
876 int write_mutex_locked = 0;
878 cio->cui_fd = LUSTRE_FPRIVATE(file);
879 vio->cui_io_subtype = args->via_io_subtype;
881 switch (vio->cui_io_subtype) {
883 cio->cui_iov = args->u.normal.via_iov;
884 cio->cui_nrsegs = args->u.normal.via_nrsegs;
885 cio->cui_tot_nrsegs = cio->cui_nrsegs;
886 #ifndef HAVE_FILE_WRITEV
887 cio->cui_iocb = args->u.normal.via_iocb;
/* Non-group-lock writes serialize on lli_write_mutex; reads only take
 * the truncate semaphore shared. */
889 if ((iot == CIT_WRITE) &&
890 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
891 if (mutex_lock_interruptible(&lli->
893 GOTO(out, result = -ERESTARTSYS);
894 write_mutex_locked = 1;
895 } else if (iot == CIT_READ) {
896 down_read(&lli->lli_trunc_sem);
900 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
901 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
904 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
905 vio->u.splice.cui_flags = args->u.splice.via_flags;
908 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
911 result = cl_io_loop(env, io);
912 if (write_mutex_locked)
913 mutex_unlock(&lli->lli_write_mutex);
914 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
915 up_read(&lli->lli_trunc_sem);
917 /* cl_io_rw_init() handled IO */
918 result = io->ci_result;
/* Some bytes transferred: report them and advance the file position. */
921 if (io->ci_nob > 0) {
923 *ppos = io->u.ci_wr.wr.crw_pos;
928 /* If any bit been read/written (result != 0), we just return
929 * short read/write instead of restart io. */
930 if (result == 0 && io->ci_need_restart) {
931 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
932 iot == CIT_READ ? "read" : "write",
933 file->f_dentry->d_name.name, *ppos, count);
934 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
938 if (iot == CIT_READ) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_READ_BYTES, result);
942 } else if (iot == CIT_WRITE) {
944 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
945 LPROC_LL_WRITE_BYTES, result);
946 fd->fd_write_failed = false;
947 } else if (result != -ERESTARTSYS) {
948 fd->fd_write_failed = true;
/*
957 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 *
 * Validate an iovec array: reject negative lengths and cumulative
 * overflow; on an access_ok() failure, truncate *nr_segs to the good
 * prefix.  Total byte count is returned through *count.
 * NOTE(review): several lines (the continue/-EFAULT branch, *count
 * assignment, return) are missing from this extraction.
 */
959 static int ll_file_get_iov_count(const struct iovec *iov,
960 unsigned long *nr_segs, size_t *count)
965 for (seg = 0; seg < *nr_segs; seg++) {
966 const struct iovec *iv = &iov[seg];
969 * If any segment has a negative length, or the cumulative
970 * length ever wraps negative then return -EINVAL.
973 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
975 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
980 cnt -= iv->iov_len; /* This segment is no good */
987 #ifdef HAVE_FILE_READV
/*
 * ll_file_readv(): vectored read entry point (pre-AIO kernels): validate
 * the iovec, set up normal-I/O args and delegate to ll_file_io_generic().
 */
988 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
989 unsigned long nr_segs, loff_t *ppos)
992 struct vvp_io_args *args;
998 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1002 env = cl_env_get(&refcheck);
1004 RETURN(PTR_ERR(env));
1006 args = vvp_env_args(env, IO_NORMAL);
1007 args->u.normal.via_iov = (struct iovec *)iov;
1008 args->u.normal.via_nrsegs = nr_segs;
1010 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1011 cl_env_put(env, &refcheck);
/*
 * ll_file_read() (readv-era variant): wrap the user buffer in a single
 * per-thread iovec and forward to ll_file_readv().
 */
1015 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1019 struct iovec *local_iov;
1024 env = cl_env_get(&refcheck);
1026 RETURN(PTR_ERR(env));
1028 local_iov = &vvp_env_info(env)->vti_local_iov;
1029 local_iov->iov_base = (void __user *)buf;
1030 local_iov->iov_len = count;
1031 result = ll_file_readv(file, local_iov, 1, ppos);
1032 cl_env_put(env, &refcheck);
/*
 * ll_file_aio_read(): AIO read entry point: validate the iovec, record the
 * kiocb in the I/O args, and delegate to ll_file_io_generic() using the
 * iocb's position.
 */
1037 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1038 unsigned long nr_segs, loff_t pos)
1041 struct vvp_io_args *args;
1047 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1051 env = cl_env_get(&refcheck);
1053 RETURN(PTR_ERR(env));
1055 args = vvp_env_args(env, IO_NORMAL);
1056 args->u.normal.via_iov = (struct iovec *)iov;
1057 args->u.normal.via_nrsegs = nr_segs;
1058 args->u.normal.via_iocb = iocb;
1060 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1061 &iocb->ki_pos, count);
1062 cl_env_put(env, &refcheck);
/*
 * ll_file_read() (AIO-era variant): build a synchronous kiocb plus a
 * single-segment iovec around the user buffer and forward to
 * ll_file_aio_read(), then propagate the updated position to *ppos.
 */
1066 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1070 struct iovec *local_iov;
1071 struct kiocb *kiocb;
1076 env = cl_env_get(&refcheck);
1078 RETURN(PTR_ERR(env));
1080 local_iov = &vvp_env_info(env)->vti_local_iov;
1081 kiocb = &vvp_env_info(env)->vti_kiocb;
1082 local_iov->iov_base = (void __user *)buf;
1083 local_iov->iov_len = count;
1084 init_sync_kiocb(kiocb, file);
1085 kiocb->ki_pos = *ppos;
1086 kiocb->ki_left = count;
1088 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1089 *ppos = kiocb->ki_pos;
1091 cl_env_put(env, &refcheck);
/*
1097 * Write to a file (through the page cache).
 */
1099 #ifdef HAVE_FILE_WRITEV
/*
 * ll_file_writev(): vectored write entry point (pre-AIO kernels): validate
 * the iovec, set up normal-I/O args, delegate to ll_file_io_generic().
 */
1100 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1101 unsigned long nr_segs, loff_t *ppos)
1104 struct vvp_io_args *args;
1110 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1114 env = cl_env_get(&refcheck);
1116 RETURN(PTR_ERR(env));
1118 args = vvp_env_args(env, IO_NORMAL);
1119 args->u.normal.via_iov = (struct iovec *)iov;
1120 args->u.normal.via_nrsegs = nr_segs;
1122 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1123 cl_env_put(env, &refcheck);
/*
 * ll_file_write() (writev-era variant): wrap the user buffer in a single
 * per-thread iovec and forward to ll_file_writev().
 */
1127 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1131 struct iovec *local_iov;
1136 env = cl_env_get(&refcheck);
1138 RETURN(PTR_ERR(env));
1140 local_iov = &vvp_env_info(env)->vti_local_iov;
1141 local_iov->iov_base = (void __user *)buf;
1142 local_iov->iov_len = count;
1144 result = ll_file_writev(file, local_iov, 1, ppos);
1145 cl_env_put(env, &refcheck);
1149 #else /* AIO stuff */
/*
 * ll_file_aio_write(): AIO write entry point: validate the iovec, record
 * the kiocb in the I/O args, and delegate to ll_file_io_generic() using
 * the iocb's position.
 */
1150 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1151 unsigned long nr_segs, loff_t pos)
1154 struct vvp_io_args *args;
1160 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1164 env = cl_env_get(&refcheck);
1166 RETURN(PTR_ERR(env));
1168 args = vvp_env_args(env, IO_NORMAL);
1169 args->u.normal.via_iov = (struct iovec *)iov;
1170 args->u.normal.via_nrsegs = nr_segs;
1171 args->u.normal.via_iocb = iocb;
1173 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1174 &iocb->ki_pos, count);
1175 cl_env_put(env, &refcheck);
/*
 * ll_file_write() (AIO-era variant): build a synchronous kiocb plus a
 * single-segment iovec around the user buffer and forward to
 * ll_file_aio_write(), then propagate the updated position to *ppos.
 */
1179 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1183 struct iovec *local_iov;
1184 struct kiocb *kiocb;
1189 env = cl_env_get(&refcheck);
1191 RETURN(PTR_ERR(env));
1193 local_iov = &vvp_env_info(env)->vti_local_iov;
1194 kiocb = &vvp_env_info(env)->vti_kiocb;
1195 local_iov->iov_base = (void __user *)buf;
1196 local_iov->iov_len = count;
1197 init_sync_kiocb(kiocb, file);
1198 kiocb->ki_pos = *ppos;
1199 kiocb->ki_left = count;
1201 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1202 *ppos = kiocb->ki_pos;
1204 cl_env_put(env, &refcheck);
1210 #ifdef HAVE_KERNEL_SENDFILE
/*
1212 * Send file content (through pagecache) somewhere with helper
 * (sendfile path: wraps the actor/target in IO_SENDFILE args and runs a
 * CIT_READ through ll_file_io_generic()).
 */
1214 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1215 read_actor_t actor, void *target)
1218 struct vvp_io_args *args;
1223 env = cl_env_get(&refcheck);
1225 RETURN(PTR_ERR(env));
1227 args = vvp_env_args(env, IO_SENDFILE);
1228 args->u.sendfile.via_target = target;
1229 args->u.sendfile.via_actor = actor;
1231 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1232 cl_env_put(env, &refcheck);
1237 #ifdef HAVE_KERNEL_SPLICE_READ
/*
1239 * Send file content (through pagecache) somewhere with helper
 * (splice path: wraps the pipe/flags in IO_SPLICE args and runs a
 * CIT_READ through ll_file_io_generic()).
 */
1241 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1242 struct pipe_inode_info *pipe, size_t count,
1246 struct vvp_io_args *args;
1251 env = cl_env_get(&refcheck);
1253 RETURN(PTR_ERR(env));
1255 args = vvp_env_args(env, IO_SPLICE);
1256 args->u.splice.via_pipe = pipe;
1257 args->u.splice.via_flags = flags;
1259 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1260 cl_env_put(env, &refcheck);
/*
 * ll_lov_recreate(): recreate an OST object @oi on OST index @ost_idx for
 * this inode by cloning the inode's stripe MD and issuing obd_create()
 * with OBD_FL_RECREATE_OBJS.
 * NOTE(review): lines are missing (obdo allocation for @oa, the lines
 * storing @oi into the obdo, error paths) -- flow below is partial.
 */
1265 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1268 struct obd_export *exp = ll_i2dtexp(inode);
1269 struct obd_trans_info oti = { 0 };
1270 struct obdo *oa = NULL;
1273 struct lov_stripe_md *lsm = NULL, *lsm2;
1280 lsm = ccc_inode_lsm_get(inode);
1282 GOTO(out, rc = -ENOENT);
1284 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1285 (lsm->lsm_stripe_count));
1287 OBD_ALLOC_LARGE(lsm2, lsm_size);
1289 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
1292 oa->o_nlink = ost_idx;
1293 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1294 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1295 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1296 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1297 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1298 memcpy(lsm2, lsm, lsm_size);
/* Hold the size lock while objects are (re)created. */
1299 ll_inode_size_lock(inode);
1300 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1301 ll_inode_size_unlock(inode);
1303 OBD_FREE_LARGE(lsm2, lsm_size);
1306 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a ll_recreate_obj request from
 * userspace and recreate the object by (mdt0-seq) object id on the
 * requested OST index. Admin-only (CFS_CAP_SYS_ADMIN).
 * NOTE(review): the -EPERM/-EFAULT early returns are elided here.
 */
1311 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1313 struct ll_recreate_obj ucreat;
1317 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1320 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1324 ostid_set_seq_mdt0(&oi);
1325 ostid_set_id(&oi, ucreat.lrc_id);
1326 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a lu_fid from userspace, convert
 * it to an OST id and derive the OST index from the fid sequence, then
 * recreate the object. Admin-only (CFS_CAP_SYS_ADMIN).
 * NOTE(review): error-return lines are elided in this excerpt.
 */
1329 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1336 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1339 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1342 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the fid sequence. */
1343 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1344 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Set the striping EA on a file by opening it with an IT_OPEN intent
 * that carries the lov_user_md. Fails early if a layout already
 * exists. The open handle obtained as a side effect is released again.
 * NOTE(review): labels (out, out_req_free) and several intermediate
 * lines are elided in this excerpt; comments cover visible code only.
 */
1347 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1348 int flags, struct lov_user_md *lum, int lum_size)
1350 struct lov_stripe_md *lsm = NULL;
1351 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A stripe can only be set once; bail out if one is present. */
1355 lsm = ccc_inode_lsm_get(inode);
1357 ccc_inode_lsm_put(inode, lsm);
1358 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1363 ll_inode_size_lock(inode);
1364 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1367 rc = oit.d.lustre.it_status;
1369 GOTO(out_req_free, rc);
/* We only wanted the layout set; drop the MDS open handle again. */
1371 ll_release_openhandle(file->f_dentry, &oit);
1374 ll_inode_size_unlock(inode);
1375 ll_intent_release(&oit);
1376 ccc_inode_lsm_put(inode, lsm);
1379 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) of 'filename' relative to 'inode'
 * via md_getattr_name, validate the magic, and byte-swap the user md
 * to host endianness on big-endian hosts. On success *lmmp/*lmm_size
 * point into the reply; the caller owns *request and must finish it.
 * NOTE(review): 'out' label, some declarations and the final RETURN
 * are elided in this excerpt.
 */
1383 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1384 struct lov_mds_md **lmmp, int *lmm_size,
1385 struct ptlrpc_request **request)
1387 struct ll_sb_info *sbi = ll_i2sbi(inode);
1388 struct mdt_body *body;
1389 struct lov_mds_md *lmm = NULL;
1390 struct ptlrpc_request *req = NULL;
1391 struct md_op_data *op_data;
/* Size the getattr buffer for the largest possible MDS EA. */
1394 rc = ll_get_max_mdsize(sbi, &lmmsize);
1398 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1399 strlen(filename), lmmsize,
1400 LUSTRE_OPC_ANY, NULL);
1401 if (IS_ERR(op_data))
1402 RETURN(PTR_ERR(op_data));
1404 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1405 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1406 ll_finish_md_op_data(op_data);
1408 CDEBUG(D_INFO, "md_getattr_name failed "
1409 "on %s: rc %d\n", filename, rc);
1413 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1414 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1416 lmmsize = body->eadatasize;
/* No striping EA present -> -ENODATA. */
1418 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1420 GOTO(out, rc = -ENODATA);
1423 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1424 LASSERT(lmm != NULL);
/* Only V1/V3 user md layouts are understood here. */
1426 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1427 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1428 GOTO(out, rc = -EPROTO);
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
/* Condition is true only on big-endian hosts, where swabbing is needed. */
1436 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1437 /* if function called for directory - we should
1438 * avoid swab not existent lsm objects */
1439 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1440 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1441 if (S_ISREG(body->mode))
1442 lustre_swab_lov_user_md_objects(
1443 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1444 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1445 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1446 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1450 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1456 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (plus one ost_data
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info with
 * MDS_OPEN_HAS_OBJS. Admin-only (CFS_CAP_SYS_ADMIN).
 * NOTE(review): -EPERM/-ENOMEM/-EFAULT returns and the final RETURN
 * are elided in this excerpt.
 */
1461 static int ll_lov_setea(struct inode *inode, struct file *file,
1464 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1465 struct lov_user_md *lump;
1466 int lum_size = sizeof(struct lov_user_md) +
1467 sizeof(struct lov_user_ost_data);
1471 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1474 OBD_ALLOC_LARGE(lump, lum_size);
1478 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1479 OBD_FREE_LARGE(lump, lum_size);
1483 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1485 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a v1 lov_user_md first (smaller),
 * upgrade to the v3 copy if the magic says so, set the stripe, then on
 * success refresh the layout and echo the resulting striping back to
 * userspace via LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): -EFAULT returns and the closing RETURN are elided.
 */
1489 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1492 struct lov_user_md_v3 lumv3;
1493 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1494 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1495 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1497 int flags = FMODE_WRITE;
1500 /* first try with v1 which is smaller than v3 */
1501 lum_size = sizeof(struct lov_user_md_v1);
1502 if (copy_from_user(lumv1, lumv1p, lum_size))
/* Magic identifies a v3 request: re-copy the full v3 structure. */
1505 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1506 lum_size = sizeof(struct lov_user_md_v3);
1507 if (copy_from_user(&lumv3, lumv3p, lum_size))
1511 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1513 struct lov_stripe_md *lsm;
/* Zero stripe_count in the user buffer before GETSTRIPE fills it in. */
1516 put_user(0, &lumv1p->lmm_stripe_count);
1518 ll_layout_refresh(inode, &gen);
1519 lsm = ccc_inode_lsm_get(inode);
1520 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1521 0, lsm, (void *)arg);
1522 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's lsm to the data
 * export's iocontrol, which copies the striping info to userspace.
 * NOTE(review): the no-layout error path and RETURN are elided here.
 */
1527 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1529 struct lov_stripe_md *lsm;
1533 lsm = ccc_inode_lsm_get(inode);
1535 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1537 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group lock with id 'arg' for
 * this file descriptor. Per-fd state (fd_flags/fd_grouplock) is
 * protected by lli->lli_lock; cl_get_grouplock() is called outside the
 * spinlock, so a lost race with another thread is detected afterwards
 * and the duplicate lock is dropped.
 * NOTE(review): some early returns/RETURN are elided in this excerpt.
 */
1541 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1543 struct ll_inode_info *lli = ll_i2info(inode);
1544 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1545 struct ccc_grouplock grouplock;
1549 if (ll_file_nolock(file))
1550 RETURN(-EOPNOTSUPP);
/* Only one group lock per fd. */
1552 spin_lock(&lli->lli_lock);
1553 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1554 CWARN("group lock already existed with gid %lu\n",
1555 fd->fd_grouplock.cg_gid);
1556 spin_unlock(&lli->lli_lock);
1559 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1560 spin_unlock(&lli->lli_lock);
/* Enqueue may block unless the file was opened O_NONBLOCK. */
1562 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1563 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1567 spin_lock(&lli->lli_lock);
1568 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1569 spin_unlock(&lli->lli_lock);
1570 CERROR("another thread just won the race\n");
1571 cl_put_grouplock(&grouplock);
1575 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1576 fd->fd_grouplock = grouplock;
1577 spin_unlock(&lli->lli_lock);
1579 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock with id 'arg' held by
 * this fd. The fd state is cleared under lli->lli_lock; the actual
 * cl_put_grouplock() runs after the spinlock is dropped.
 * NOTE(review): some early returns/RETURN are elided in this excerpt.
 */
1583 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1585 struct ll_inode_info *lli = ll_i2info(inode);
1586 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1587 struct ccc_grouplock grouplock;
1590 spin_lock(&lli->lli_lock);
1591 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1592 spin_unlock(&lli->lli_lock);
1593 CWARN("no group lock held\n");
1596 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* The gid passed by userspace must match the lock actually held. */
1598 if (fd->fd_grouplock.cg_gid != arg) {
1599 CWARN("group lock %lu doesn't match current id %lu\n",
1600 arg, fd->fd_grouplock.cg_gid);
1601 spin_unlock(&lli->lli_lock);
/* Detach from the fd under the lock, release outside it. */
1605 grouplock = fd->fd_grouplock;
1606 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1607 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1608 spin_unlock(&lli->lli_lock);
1610 cl_put_grouplock(&grouplock);
1611 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1616  * Close inode open handle
1618  * \param dentry [in] dentry which contains the inode
1619  * \param it [in,out] intent which contains open info and result
1622  * \retval <0 failure
/*
 * Release an MDS open handle produced as a side effect of an intent
 * open (e.g. during setstripe): skip root and intents with no open
 * disposition, fill an obd_client_handle from the intent, close it,
 * and drop the enqueue-open request reference if one is held.
 * NOTE(review): 'out' label, EXIT path and RETURN are elided here.
 */
1624 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1626 struct inode *inode = dentry->d_inode;
1627 struct obd_client_handle *och;
1633 /* Root ? Do nothing. */
1634 if (dentry->d_inode->i_sb->s_root == dentry)
1637 /* No open handle to close? Move away */
1638 if (!it_disposition(it, DISP_OPEN_OPEN))
1641 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1643 OBD_ALLOC(och, sizeof(*och));
1645 GOTO(out, rc = -ENOMEM);
1647 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1648 ll_i2info(inode), it, och);
1650 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1653 /* this one is in place of ll_file_open */
1654 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1655 ptlrpc_req_finished(it->d.lustre.it_data);
1656 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1662  * Get size for inode for which FIEMAP mapping is requested.
1663  * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: validate flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), build a ll_fiemap_info_key from the lsm
 * and inode attributes, and fetch the extent mapping from the data
 * export via obd_get_info(KEY_FIEMAP).
 * NOTE(review): 'out' label, some error paths and RETURN are elided.
 */
1665 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1668 struct obd_export *exp = ll_i2dtexp(inode);
1669 struct lov_stripe_md *lsm = NULL;
1670 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1671 int vallen = num_bytes;
1675 /* Checks for fiemap flags */
1676 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back the unsupported flags to the caller. */
1677 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1681 /* Check for FIEMAP_FLAG_SYNC */
1682 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1683 rc = filemap_fdatawrite(inode->i_mapping);
1688 lsm = ccc_inode_lsm_get(inode);
1692 /* If the stripe_count > 1 and the application does not understand
1693 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1695 if (lsm->lsm_stripe_count > 1 &&
1696 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1697 GOTO(out, rc = -EOPNOTSUPP);
1699 fm_key.oa.o_oi = lsm->lsm_oi;
1700 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1702 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1703 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1704 /* If filesize is 0, then there would be no objects for mapping */
1705 if (fm_key.oa.o_size == 0) {
1706 fiemap->fm_mapped_extents = 0;
1710 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1712 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1715 CERROR("obd_get_info failed: rc = %d\n", rc);
1718 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: translate a FID to a path. Reads a
 * getinfo_fid2path header from userspace to learn gf_pathlen, sizes
 * the output buffer accordingly, delegates the lookup to the MD export
 * via obd_iocontrol, and copies the result back.
 * NOTE(review): several error returns, labels and the free of 'gfin'
 * are elided in this excerpt.
 */
1722 int ll_fid2path(struct inode *inode, void *arg)
1724 struct obd_export *exp = ll_i2mdexp(inode);
1725 struct getinfo_fid2path *gfout, *gfin;
/* Allowed for root or when the mount permits user fid2path. */
1729 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1730 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1733 /* Need to get the buflen */
1734 OBD_ALLOC_PTR(gfin);
1737 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1742 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1743 OBD_ALLOC(gfout, outsize);
1744 if (gfout == NULL) {
1748 memcpy(gfout, gfin, sizeof(*gfout));
1751 /* Call mdc_iocontrol */
1752 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1756 if (copy_to_user(arg, gfout, outsize))
1760 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: read fm_extent_count to size a kernel
 * fiemap buffer, copy the request (and the first extent, used as the
 * continuation cursor) in, run ll_do_fiemap(), and copy the header
 * plus mapped extents back to userspace.
 * NOTE(review): the zero-extent-count branch, 'error' label tail and
 * RETURN are elided in this excerpt.
 */
1764 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1766 struct ll_user_fiemap *fiemap_s;
1767 size_t num_bytes, ret_bytes;
1768 unsigned int extent_count;
1771 /* Get the extent count so we can calculate the size of
1772 * required fiemap buffer */
1773 if (get_user(extent_count,
1774 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1776 num_bytes = sizeof(*fiemap_s) + (extent_count *
1777 sizeof(struct ll_fiemap_extent));
1779 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1780 if (fiemap_s == NULL)
1783 /* get the fiemap value */
1784 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1786 GOTO(error, rc = -EFAULT);
1788 /* If fm_extent_count is non-zero, read the first extent since
1789 * it is used to calculate end_offset and device from previous
1792 if (copy_from_user(&fiemap_s->fm_extents[0],
1793 (char __user *)arg + sizeof(*fiemap_s),
1794 sizeof(struct ll_fiemap_extent)))
1795 GOTO(error, rc = -EFAULT);
1798 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header plus only the extents actually mapped. */
1802 ret_bytes = sizeof(struct ll_user_fiemap);
1804 if (extent_count != 0)
1805 ret_bytes += (fiemap_s->fm_mapped_extents *
1806 sizeof(struct ll_fiemap_extent));
1808 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1812 OBD_FREE_LARGE(fiemap_s, num_bytes);
1817  * Read the data_version for inode.
1819  * This value is computed using stripe object version on OST.
1820  * Version is computed using server side locking.
1822  * @param extent_lock Take extent lock. Not needed if a process is already
1823  * holding the OST object group locks.
/*
 * Query the OST-side data version of the file via ll_lsm_getattr();
 * a file with no layout reports version 0. 'extent_lock' selects
 * server-side extent locking for the getattr.
 * NOTE(review): 'out' cleanup (freeing obdo) and RETURN are elided.
 */
1825 int ll_data_version(struct inode *inode, __u64 *data_version,
1828 struct lov_stripe_md *lsm = NULL;
1829 struct ll_sb_info *sbi = ll_i2sbi(inode);
1830 struct obdo *obdo = NULL;
1834 /* If no stripe, we consider version is 0. */
1835 lsm = ccc_inode_lsm_get(inode);
1838 CDEBUG(D_INODE, "No object for inode\n");
1842 OBD_ALLOC_PTR(obdo);
1844 ccc_inode_lsm_put(inode, lsm);
1848 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* The OST must report FLDATAVERSION for the result to be valid. */
1850 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1853 *data_version = obdo->o_data_version;
1857 ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * saved mtime/atime (ia1/ia2) for optional restoration, and whether
 * each side's data version must be verified before the swap.
 * NOTE(review): dv1/dv2 member lines are elided in this excerpt.
 */
1862 struct ll_swap_stack {
1863 struct iattr ia1, ia2;
1865 struct inode *inode1, *inode2;
1866 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS core: atomically exchange the layouts of two
 * files on the MDT. Steps visible here: permission/sanity checks,
 * canonical ordering of the pair by FID (to avoid deadlocks), optional
 * group locks to flush dirty cache, optional data-version checks,
 * the MDT swap RPC, and optional restoration of mtime/atime.
 * NOTE(review): labels (free, putgl), several declarations and
 * intermediate lines are elided in this excerpt; comments describe
 * only the visible flow.
 */
1869 static int ll_swap_layouts(struct file *file1, struct file *file2,
1870 struct lustre_swap_layouts *lsl)
1872 struct mdc_swap_layouts msl;
1873 struct md_op_data *op_data;
1876 struct ll_swap_stack *llss = NULL;
1879 OBD_ALLOC_PTR(llss);
1883 llss->inode1 = file1->f_dentry->d_inode;
1884 llss->inode2 = file2->f_dentry->d_inode;
1886 if (!S_ISREG(llss->inode2->i_mode))
1887 GOTO(free, rc = -EINVAL);
/* Both files must be writable and live on the same filesystem. */
1889 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1890 ll_permission(llss->inode2, MAY_WRITE, NULL))
1891 GOTO(free, rc = -EPERM);
1893 if (llss->inode2->i_sb != llss->inode1->i_sb)
1894 GOTO(free, rc = -EXDEV);
1896 /* we use 2 bool because it is easier to swap than 2 bits */
1897 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1898 llss->check_dv1 = true;
1900 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1901 llss->check_dv2 = true;
1903 /* we cannot use lsl->sl_dvX directly because we may swap them */
1904 llss->dv1 = lsl->sl_dv1;
1905 llss->dv2 = lsl->sl_dv2;
1907 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1908 if (rc == 0) /* same file, done! */
/* Order the pair by FID so concurrent swaps cannot deadlock. */
1911 if (rc < 0) { /* sequentialize it */
1912 swap(llss->inode1, llss->inode2);
1914 swap(llss->dv1, llss->dv2);
1915 swap(llss->check_dv1, llss->check_dv2);
1919 if (gid != 0) { /* application asks to flush dirty cache */
1920 rc = ll_get_grouplock(llss->inode1, file1, gid);
1924 rc = ll_get_grouplock(llss->inode2, file2, gid);
1926 ll_put_grouplock(llss->inode1, file1, gid);
1931 /* to be able to restore mtime and atime after swap
1932 * we need to first save them */
1934 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1935 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1936 llss->ia1.ia_atime = llss->inode1->i_atime;
1937 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1938 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1939 llss->ia2.ia_atime = llss->inode2->i_atime;
1940 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1943 /* ultimate check, before swaping the layouts we check if
1944 * dataversion has changed (if requested) */
1945 if (llss->check_dv1) {
1946 rc = ll_data_version(llss->inode1, &dv, 0);
1949 if (dv != llss->dv1)
1950 GOTO(putgl, rc = -EAGAIN);
1953 if (llss->check_dv2) {
1954 rc = ll_data_version(llss->inode2, &dv, 0);
1957 if (dv != llss->dv2)
1958 GOTO(putgl, rc = -EAGAIN);
1961 /* struct md_op_data is used to send the swap args to the mdt
1962 * only flags is missing, so we use struct mdc_swap_layouts
1963 * through the md_op_data->op_data */
1964 /* flags from user space have to be converted before they are send to
1965 * server, no flag is sent today, they are only used on the client */
1968 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1969 0, LUSTRE_OPC_ANY, &msl);
1970 if (op_data != NULL) {
1971 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1972 ll_i2mdexp(llss->inode1),
1973 sizeof(*op_data), op_data, NULL);
1974 ll_finish_md_op_data(op_data);
/* putgl: drop group locks in reverse acquisition order. */
1979 ll_put_grouplock(llss->inode2, file2, gid);
1980 ll_put_grouplock(llss->inode1, file1, gid);
1983 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1987 /* clear useless flags */
1988 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1989 llss->ia1.ia_valid &= ~ATTR_MTIME;
1990 llss->ia2.ia_valid &= ~ATTR_MTIME;
1993 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1994 llss->ia1.ia_valid &= ~ATTR_ATIME;
1995 llss->ia2.ia_valid &= ~ATTR_ATIME;
1998 /* update time if requested */
/* ia1/ia2 are applied to the opposite inode: times follow the layout. */
2000 if (llss->ia2.ia_valid != 0) {
2001 mutex_lock(&llss->inode1->i_mutex);
2002 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2003 mutex_unlock(&llss->inode1->i_mutex);
2006 if (llss->ia1.ia_valid != 0) {
2009 mutex_lock(&llss->inode2->i_mutex);
2010 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2011 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Main ioctl dispatcher for regular files: handles per-fd flag
 * get/set, striping (setstripe/setea/getstripe/swap-layouts), object
 * recreation, fiemap, group locks, statfs, fid/path translation, data
 * version, MDT index, HSM state/action, and falls through to the
 * obd iocontrol layers for anything unrecognized.
 * NOTE(review): many case bodies, labels and RETURNs are elided in
 * this excerpt; comments cover only what is visible.
 */
2023 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2025 struct inode *inode = file->f_dentry->d_inode;
2026 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2030 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2031 inode->i_generation, inode, cmd);
2032 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2034 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2035 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2039 case LL_IOC_GETFLAGS:
2040 /* Get the current value of the file flags */
2041 return put_user(fd->fd_flags, (int *)arg);
2042 case LL_IOC_SETFLAGS:
2043 case LL_IOC_CLRFLAGS:
2044 /* Set or clear specific file flags */
2045 /* XXX This probably needs checks to ensure the flags are
2046 * not abused, and to handle any flag side effects.
2048 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only allowed on O_DIRECT files. */
2051 if (cmd == LL_IOC_SETFLAGS) {
2052 if ((flags & LL_FILE_IGNORE_LOCK) &&
2053 !(file->f_flags & O_DIRECT)) {
2054 CERROR("%s: unable to disable locking on "
2055 "non-O_DIRECT file\n", current->comm);
2059 fd->fd_flags |= flags;
2061 fd->fd_flags &= ~flags;
2064 case LL_IOC_LOV_SETSTRIPE:
2065 RETURN(ll_lov_setstripe(inode, file, arg));
2066 case LL_IOC_LOV_SETEA:
2067 RETURN(ll_lov_setea(inode, file, arg));
2068 case LL_IOC_LOV_SWAP_LAYOUTS: {
2070 struct lustre_swap_layouts lsl;
2072 if (cfs_copy_from_user(&lsl, (char *)arg,
2073 sizeof(struct lustre_swap_layouts)))
/* Both fds must be writable for a layout swap. */
2076 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2079 file2 = fget(lsl.sl_fd);
2084 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2085 rc = ll_swap_layouts(file, file2, &lsl);
2089 case LL_IOC_LOV_GETSTRIPE:
2090 RETURN(ll_lov_getstripe(inode, arg));
2091 case LL_IOC_RECREATE_OBJ:
2092 RETURN(ll_lov_recreate_obj(inode, arg));
2093 case LL_IOC_RECREATE_FID:
2094 RETURN(ll_lov_recreate_fid(inode, arg));
2095 case FSFILT_IOC_FIEMAP:
2096 RETURN(ll_ioctl_fiemap(inode, arg));
2097 case FSFILT_IOC_GETFLAGS:
2098 case FSFILT_IOC_SETFLAGS:
2099 RETURN(ll_iocontrol(inode, file, cmd, arg));
2100 case FSFILT_IOC_GETVERSION_OLD:
2101 case FSFILT_IOC_GETVERSION:
2102 RETURN(put_user(inode->i_generation, (int *)arg));
2103 case LL_IOC_GROUP_LOCK:
2104 RETURN(ll_get_grouplock(inode, file, arg));
2105 case LL_IOC_GROUP_UNLOCK:
2106 RETURN(ll_put_grouplock(inode, file, arg));
2107 case IOC_OBD_STATFS:
2108 RETURN(ll_obd_statfs(inode, (void *)arg));
2110 /* We need to special case any other ioctls we want to handle,
2111 * to send them to the MDS/OST as appropriate and to properly
2112 * network encode the arg field.
2113 case FSFILT_IOC_SETVERSION_OLD:
2114 case FSFILT_IOC_SETVERSION:
2116 case LL_IOC_FLUSHCTX:
2117 RETURN(ll_flush_ctx(inode));
2118 case LL_IOC_PATH2FID: {
2119 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2120 sizeof(struct lu_fid)))
2125 case OBD_IOC_FID2PATH:
2126 RETURN(ll_fid2path(inode, (void *)arg));
2127 case LL_IOC_DATA_VERSION: {
2128 struct ioc_data_version idv;
2131 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH skips the extent lock when reading the version. */
2134 rc = ll_data_version(inode, &idv.idv_version,
2135 !(idv.idv_flags & LL_DV_NOFLUSH));
2137 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2143 case LL_IOC_GET_MDTIDX: {
2146 mdtidx = ll_get_mdt_idx(inode);
2150 if (put_user((int)mdtidx, (int*)arg))
2155 case OBD_IOC_GETDTNAME:
2156 case OBD_IOC_GETMDNAME:
2157 RETURN(ll_get_obd_name(inode, cmd, arg));
2158 case LL_IOC_HSM_STATE_GET: {
2159 struct md_op_data *op_data;
2160 struct hsm_user_state *hus;
/* HSM state is fetched from the MDT; hus rides in op_data. */
2167 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2168 LUSTRE_OPC_ANY, hus);
2169 if (op_data == NULL) {
2174 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2177 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2180 ll_finish_md_op_data(op_data);
2184 case LL_IOC_HSM_STATE_SET: {
2185 struct md_op_data *op_data;
2186 struct hsm_state_set *hss;
2192 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2197 /* Non-root users are forbidden to set or clear flags which are
2198 * NOT defined in HSM_USER_MASK. */
2199 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2200 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2205 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2206 LUSTRE_OPC_ANY, hss);
2207 if (op_data == NULL) {
2212 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2215 ll_finish_md_op_data(op_data);
2220 case LL_IOC_HSM_ACTION: {
2221 struct md_op_data *op_data;
2222 struct hsm_current_action *hca;
2229 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2230 LUSTRE_OPC_ANY, hca);
2231 if (op_data == NULL) {
2236 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2239 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2242 ll_finish_md_op_data(op_data);
/* default: try registered iocontrol handlers, then the data export. */
2250 ll_iocontrol_call(inode, file, cmd, arg, &err))
2253 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2259 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Apply a computed seek offset to file->f_pos after range-checking it
 * against 0 (unless FMODE_UNSIGNED_OFFSET) and maxsize; resets
 * f_version when the position actually changes. Local fallback for
 * kernels without generic_file_llseek_size().
 * NOTE(review): the error-return and final return lines are elided.
 */
2260 static inline loff_t
2261 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2263 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2265 if (offset > maxsize)
2268 if (offset != file->f_pos) {
2269 file->f_pos = offset;
2270 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels lacking it:
 * resolves SEEK_END/SEEK_CUR/SEEK_HOLE/SEEK_DATA against 'maxsize'
 * and the supplied 'eof', taking i_mutex for the SEEK_CUR update.
 * NOTE(review): the switch statement, SEEK_* case labels and several
 * branches are elided in this excerpt; only fragments are visible.
 */
2276 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2277 loff_t maxsize, loff_t eof)
2279 struct inode *inode = file->f_dentry->d_inode;
2287 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2288 * position-querying operation. Avoid rewriting the "same"
2289 * f_pos value back to the file because a concurrent read(),
2290 * write() or lseek() might have altered it
2295 * f_lock protects against read/modify/write race with other
2296 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR update is serialized with i_mutex here. */
2299 mutex_lock(&inode->i_mutex);
2300 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2301 mutex_unlock(&inode->i_mutex);
2305 * In the generic case the entire file is data, so as long as
2306 * offset isn't at the end of the file then the offset is data.
2313 * There is a virtual hole at the end of the file, so as long as
2314 * offset isn't i_size or larger, return i_size.
2322 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA first glimpse
 * the cluster-wide size, then delegate to the (possibly local)
 * generic_file_llseek_size with the filesystem's max byte limit.
 * NOTE(review): the glimpse error path and final RETURN are elided.
 */
2326 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2328 struct inode *inode = file->f_dentry->d_inode;
2329 loff_t retval, eof = 0;
/* retval is only used here to log the absolute target position. */
2332 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2333 (origin == SEEK_CUR) ? file->f_pos : 0);
2334 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2335 inode->i_ino, inode->i_generation, inode, retval, retval,
2337 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins need an up-to-date size from the OSTs. */
2339 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2340 retval = ll_glimpse_size(inode);
2343 eof = i_size_read(inode);
2346 retval = ll_generic_file_llseek_size(file, offset, origin,
2347 ll_file_maxbytes(inode), eof);
/*
 * flush() handler (called on close of each fd): collect async
 * writeback errors recorded on the inode and its cl object, and report
 * -EIO once — unless this fd already saw the write failure.
 * NOTE(review): a line merging 'err' into 'rc' and the final return
 * are elided in this excerpt.
 */
2351 int ll_flush(struct file *file, fl_owner_t id)
2353 struct inode *inode = file->f_dentry->d_inode;
2354 struct ll_inode_info *lli = ll_i2info(inode);
2355 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2358 LASSERT(!S_ISDIR(inode->i_mode));
2360 /* catch async errors that were recorded back when async writeback
2361 * failed for pages in this mapping. */
/* Read-and-clear: the error is consumed by this flush. */
2362 rc = lli->lli_async_rc;
2363 lli->lli_async_rc = 0;
2364 err = lov_read_and_clear_async_rc(lli->lli_clob);
2368 /* The application has been told write failure already.
2369 * Do not report failure again. */
2370 if (fd->fd_write_failed)
2372 return rc ? -EIO : 0;
2376  * Called to make sure a portion of file has been written out.
2377  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2379  * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] with the requested fsync
 * mode; on success the number of pages written (fi_nr_written) is
 * returned. 'ignore_layout' is passed through to the io context.
 * NOTE(review): capa release, fio->fi_end assignment and the final
 * RETURN are elided in this excerpt.
 */
2381 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2382 enum cl_fsync_mode mode, int ignore_layout)
2384 struct cl_env_nest nest;
2387 struct obd_capa *capa = NULL;
2388 struct cl_fsync_io *fio;
/* Reject unknown fsync modes up front. */
2392 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2393 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2396 env = cl_env_nested_get(&nest);
2398 RETURN(PTR_ERR(env));
2400 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2402 io = ccc_env_thread_io(env);
2403 io->ci_obj = cl_i2info(inode)->lli_clob;
2404 io->ci_ignore_layout = ignore_layout;
2406 /* initialize parameters for sync */
2407 fio = &io->u.ci_fsync;
2408 fio->fi_capa = capa;
2409 fio->fi_start = start;
2411 fio->fi_fid = ll_inode2fid(inode);
2412 fio->fi_mode = mode;
2413 fio->fi_nr_written = 0;
2415 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2416 result = cl_io_loop(env, io);
2418 result = io->ci_result;
2420 result = fio->fi_nr_written;
2421 cl_io_fini(env, io);
2422 cl_env_nested_put(&nest, env);
2430  * When dentry is provided (the 'else' case), *file->f_dentry may be
2431  * null and dentry must be used directly rather than pulled from
2432  * *file->f_dentry as is done otherwise.
/*
 * fsync() entry point, adapted to three historical kernel signatures
 * via HAVE_FILE_FSYNC_{4ARGS,2ARGS}: wait for in-flight page IO,
 * collect recorded async-write errors, sync metadata through the MDS
 * (md_sync), and for datasync on regular files push data to the OSTs
 * via cl_sync_file_range, updating fd_write_failed accordingly.
 * NOTE(review): multiple lines (declarations, rc/err merging, mode
 * argument of cl_sync_file_range, final RETURN) are elided.
 */
2435 #ifdef HAVE_FILE_FSYNC_4ARGS
2436 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2438 struct dentry *dentry = file->f_dentry;
2439 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2440 int ll_fsync(struct file *file, int datasync)
2442 struct dentry *dentry = file->f_dentry;
2444 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2447 struct inode *inode = dentry->d_inode;
2448 struct ll_inode_info *lli = ll_i2info(inode);
2449 struct ptlrpc_request *req;
2450 struct obd_capa *oc;
2454 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2455 inode->i_generation, inode);
2456 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2458 #ifdef HAVE_FILE_FSYNC_4ARGS
/* New API: we must write+wait the range and hold i_mutex ourselves. */
2459 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2460 mutex_lock(&inode->i_mutex);
2462 /* fsync's caller has already called _fdata{sync,write}, we want
2463 * that IO to finish before calling the osc and mdc sync methods */
2464 rc = filemap_fdatawait(inode->i_mapping);
2467 /* catch async errors that were recorded back when async writeback
2468 * failed for pages in this mapping. */
2469 if (!S_ISDIR(inode->i_mode)) {
2470 err = lli->lli_async_rc;
2471 lli->lli_async_rc = 0;
2474 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync through the MDS, with capability if enabled. */
2479 oc = ll_mdscapa_get(inode);
2480 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2486 ptlrpc_req_finished(req);
2488 if (datasync && S_ISREG(inode->i_mode)) {
2489 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2491 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
/* Track write failure per-fd so ll_flush reports it only once. */
2493 if (rc == 0 && err < 0)
2496 fd->fd_write_failed = true;
2498 fd->fd_write_failed = false;
2501 #ifdef HAVE_FILE_FSYNC_4ARGS
2502 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler: translate a kernel file_lock (FL_FLOCK
 * or FL_POSIX) into an LDLM flock enqueue against the MDS, then mirror
 * the result into the local lock tables (flock_lock_file_wait /
 * posix_lock_file_wait). If the local step fails after a successful
 * enqueue, the server lock is downgraded to LCK_NL (i.e. unlocked).
 * NOTE(review): several case labels (F_RDLCK/F_UNLCK/F_WRLCK,
 * F_SETLK/F_SETLKW/F_GETLK variants), some branches and the final
 * RETURN are elided in this excerpt.
 */
2507 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2509 struct inode *inode = file->f_dentry->d_inode;
2510 struct ll_sb_info *sbi = ll_i2sbi(inode);
2511 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2512 .ei_cb_cp =ldlm_flock_completion_ast,
2513 .ei_cbdata = file_lock };
2514 struct md_op_data *op_data;
2515 struct lustre_handle lockh = {0};
2516 ldlm_policy_data_t flock = {{0}};
2522 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2523 inode->i_ino, file_lock);
2525 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2527 if (file_lock->fl_flags & FL_FLOCK) {
2528 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2529 /* flocks are whole-file locks */
2530 flock.l_flock.end = OFFSET_MAX;
2531 /* For flocks owner is determined by the local file desctiptor*/
2532 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2533 } else if (file_lock->fl_flags & FL_POSIX) {
2534 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2535 flock.l_flock.start = file_lock->fl_start;
2536 flock.l_flock.end = file_lock->fl_end;
2540 flock.l_flock.pid = file_lock->fl_pid;
2542 /* Somewhat ugly workaround for svc lockd.
2543 * lockd installs custom fl_lmops->lm_compare_owner that checks
2544 * for the fl_owner to be the same (which it always is on local node
2545 * I guess between lockd processes) and then compares pid.
2546 * As such we assign pid to the owner field to make it all work,
2547 * conflict with normal locks is unlikely since pid space and
2548 * pointer space for current->files are not intersecting */
2549 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2550 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM lock mode. */
2552 switch (file_lock->fl_type) {
2554 einfo.ei_mode = LCK_PR;
2557 /* An unlock request may or may not have any relation to
2558 * existing locks so we may not be able to pass a lock handle
2559 * via a normal ldlm_lock_cancel() request. The request may even
2560 * unlock a byte range in the middle of an existing lock. In
2561 * order to process an unlock request we need all of the same
2562 * information that is given with a normal read or write record
2563 * lock request. To avoid creating another ldlm unlock (cancel)
2564 * message we'll treat a LCK_NL flock request as an unlock. */
2565 einfo.ei_mode = LCK_NL;
2568 einfo.ei_mode = LCK_PW;
2571 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2572 file_lock->fl_type);
2587 flags = LDLM_FL_BLOCK_NOWAIT;
2593 flags = LDLM_FL_TEST_LOCK;
2594 /* Save the old mode so that if the mode in the lock changes we
2595 * can decrement the appropriate reader or writer refcount. */
2596 file_lock->fl_type = einfo.ei_mode;
2599 CERROR("unknown fcntl lock command: %d\n", cmd);
2603 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2604 LUSTRE_OPC_ANY, NULL);
2605 if (IS_ERR(op_data))
2606 RETURN(PTR_ERR(op_data));
2608 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2609 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2610 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2612 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2613 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the server result into the kernel's local lock tables. */
2615 if ((file_lock->fl_flags & FL_FLOCK) &&
2616 (rc == 0 || file_lock->fl_type == F_UNLCK))
2617 rc2 = flock_lock_file_wait(file, file_lock);
2618 if ((file_lock->fl_flags & FL_POSIX) &&
2619 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2620 !(flags & LDLM_FL_TEST_LOCK))
2621 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server lock with LCK_NL. */
2623 if (rc2 && file_lock->fl_type != F_UNLCK) {
2624 einfo.ei_mode = LCK_NL;
2625 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2626 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2630 ll_finish_md_op_data(op_data);
/*
 * No-lock variant of the flock handler (used when mount disables
 * flocks). NOTE(review): the entire body is elided in this excerpt —
 * presumably it returns an error such as -ENOSYS; verify in the full
 * source before relying on this.
 */
2635 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2643  * test if some locks matching bits and l_req_mode are acquired
2644  * - bits can be in different locks
2645  * - if found clear the common lock bits in *bits
2646  * - the bits not found, are kept in *bits
2648  * \param bits [IN] searched lock bits [IN]
2649  * \param l_req_mode [IN] searched lock mode
2650  * \retval boolean, true iff all bits are found
/*
 * For each requested inodebit, probe the local LDLM namespace with
 * LDLM_FL_TEST_LOCK (no lock is taken); matched bits — including any
 * sibling bits carried by the matched lock — are cleared from *bits.
 * NOTE(review): some declarations and the final RETURN are elided.
 */
2652 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2654 struct lustre_handle lockh;
2655 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any of CR/CW/PR/PW". */
2656 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2657 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2666 fid = &ll_i2info(inode)->lli_fid;
2667 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2668 ldlm_lockname[mode]);
2670 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2671 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2672 policy.l_inodebits.bits = *bits & (1 << i);
2673 if (policy.l_inodebits.bits == 0)
2676 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2677 &policy, mode, &lockh)) {
2678 struct ldlm_lock *lock;
2680 lock = ldlm_handle2lock(&lockh);
2683 ~(lock->l_policy_data.l_inodebits.bits);
2684 LDLM_LOCK_PUT(lock);
2686 *bits &= ~policy.l_inodebits.bits;
2693 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2694 struct lustre_handle *lockh, __u64 flags)
/* Match a cached MD lock covering \a bits and return its mode (0 if no
 * match), filling *lockh.  Unlike ll_have_md_lock() no TEST_LOCK flag
 * is used here, so on success the caller holds a reference and must
 * release it with ldlm_lock_decref() (see ll_layout_lock_set()). */
2696 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2701 fid = &ll_i2info(inode)->lli_fid;
2702 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Accept any lock mode; \a flags carries extra LDLM_FL_* match flags
 * supplied by the caller. */
2704 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2705 fid, LDLM_IBITS, &policy,
2706 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2710 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
/* Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular, non-directory inode is tolerated (object already
 * unlinked); any other error is logged. */
2712 /* Already unlinked. Just update nlink and return success */
2713 if (rc == -ENOENT) {
2715 /* This path cannot be hit for regular files unless in
2716 * case of obscure races, so no need to validate
2718 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2720 } else if (rc != 0) {
2721 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2722 ll_get_fsname(inode->i_sb, NULL, 0),
2723 PFID(ll_inode2fid(inode)), rc);
2729 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
/* Revalidate the MD attributes of \a dentry's inode.  Two paths:
 * - server supports OBD_CONNECT_ATTRFID: getattr-by-FID via an intent
 *   lock (no name needed);
 * - otherwise: plain md_getattr(), but only if no matching ibits lock
 *   is already cached locally. */
2732 struct inode *inode = dentry->d_inode;
2733 struct ptlrpc_request *req = NULL;
2734 struct obd_export *exp;
2738 LASSERT(inode != NULL);
2740 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2741 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2743 exp = ll_i2mdexp(inode);
2745 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2746 * But under CMD case, it caused some lock issues, should be fixed
2747 * with new CMD ibits lock. See bug 12718 */
2748 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2749 struct lookup_intent oit = { .it_op = IT_GETATTR };
2750 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a cheaper IT_LOOKUP intent
 * is sufficient. */
2752 if (ibits == MDS_INODELOCK_LOOKUP)
2753 oit.it_op = IT_LOOKUP;
2755 /* Call getattr by fid, so do not provide name at all. */
2756 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2757 dentry->d_inode, NULL, 0, 0,
2758 LUSTRE_OPC_ANY, NULL);
2759 if (IS_ERR(op_data))
2760 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE makes the MDS verify the cached FID is still
 * valid rather than trusting the client copy. */
2762 oit.it_create_mode |= M_CHECK_STALE;
2763 rc = md_intent_lock(exp, op_data, NULL, 0,
2764 /* we are not interested in name
2767 ll_md_blocking_ast, 0);
2768 ll_finish_md_op_data(op_data);
2769 oit.it_create_mode &= ~M_CHECK_STALE;
2771 rc = ll_inode_revalidate_fini(inode, rc);
2775 rc = ll_revalidate_it_finish(req, &oit, dentry);
2777 ll_intent_release(&oit);
2781 /* Unlinked? Unhash dentry, so it is not picked up later by
2782 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2783 here to preserve get_cwd functionality on 2.6.
2785 if (!dentry->d_inode->i_nlink)
2786 d_lustre_invalidate(dentry, 0);
2788 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: skip the RPC entirely if a cached ibits
 * lock already covers all the requested bits. */
2789 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2790 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2791 obd_valid valid = OBD_MD_FLGETATTR;
2792 struct md_op_data *op_data;
/* For regular files also fetch striping EA, sized to the
 * largest EA the MDS may return. */
2795 if (S_ISREG(inode->i_mode)) {
2796 rc = ll_get_max_mdsize(sbi, &ealen);
2799 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2802 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2803 0, ealen, LUSTRE_OPC_ANY,
2805 if (IS_ERR(op_data))
2806 RETURN(PTR_ERR(op_data));
2808 op_data->op_valid = valid;
2809 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2810 * capa for this inode. Because we only keep capas of dirs
2812 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2813 ll_finish_md_op_data(op_data);
2815 rc = ll_inode_revalidate_fini(inode, rc);
2819 rc = ll_prep_inode(&inode, req, NULL, NULL);
2822 ptlrpc_req_finished(req);
2826 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
/* Revalidate MD attributes, then refresh size/times: non-regular files
 * take times from the cached LVB, regular files glimpse the size from
 * the OSTs. */
2829 struct inode *inode = dentry->d_inode;
2833 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2837 /* if object isn't regular file, don't validate size */
2838 if (!S_ISREG(inode->i_mode)) {
2839 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2840 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2841 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
/* Regular file: ask the OSTs for the authoritative size. */
2843 rc = ll_glimpse_size(inode);
2848 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2849 struct lookup_intent *it, struct kstat *stat)
/* getattr with an explicit intent: revalidate UPDATE|LOOKUP ibits,
 * then fill *stat from the (now fresh) inode fields. */
2851 struct inode *inode = de->d_inode;
2852 struct ll_sb_info *sbi = ll_i2sbi(inode);
2853 struct ll_inode_info *lli = ll_i2info(inode);
2856 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2857 MDS_INODELOCK_LOOKUP);
2858 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2863 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits; build it from
 * the FID instead of using the kernel's i_ino. */
2864 if (ll_need_32bit_api(sbi))
2865 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1)
2867 stat->ino = inode->i_ino;
2868 stat->mode = inode->i_mode;
2869 stat->nlink = inode->i_nlink;
2870 stat->uid = inode->i_uid;
2871 stat->gid = inode->i_gid;
2872 stat->rdev = inode->i_rdev;
2873 stat->atime = inode->i_atime;
2874 stat->mtime = inode->i_mtime;
2875 stat->ctime = inode->i_ctime;
2876 stat->blksize = 1 << inode->i_blkbits;
2878 stat->size = i_size_read(inode);
2879 stat->blocks = inode->i_blocks;
2883 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
/* VFS ->getattr entry point: delegate with a fresh IT_GETATTR intent. */
2885 struct lookup_intent it = { .it_op = IT_GETATTR };
2887 return ll_getattr_it(mnt, de, &it, stat);
2890 #ifdef HAVE_LINUX_FIEMAP_H
2891 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2892 __u64 start, __u64 len)
/* VFS ->fiemap entry point: translate the kernel fiemap_extent_info
 * into Lustre's ll_user_fiemap, run the mapping, and copy the mapped
 * extents back into the caller's buffer. */
2896 struct ll_user_fiemap *fiemap;
2897 unsigned int extent_count = fieinfo->fi_extents_max;
2899 num_bytes = sizeof(*fiemap) + (extent_count *
2900 sizeof(struct ll_fiemap_extent));
2901 OBD_ALLOC_LARGE(fiemap, num_bytes);
2906 fiemap->fm_flags = fieinfo->fi_flags;
2907 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2908 fiemap->fm_start = start;
2909 fiemap->fm_length = len;
/* NOTE(review): only the FIRST extent is copied in here, presumably to
 * seed fm_extents[0] for a continuation/restart of the mapping — and if
 * extent_count is 0 fi_extents_start may be NULL; confirm both against
 * the fiemap ioctl contract. */
2910 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2911 sizeof(struct ll_fiemap_extent));
2913 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results back: flags, mapped count, and the mapped extents. */
2915 fieinfo->fi_flags = fiemap->fm_flags;
2916 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2917 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2918 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2920 OBD_FREE_LARGE(fiemap, num_bytes);
2925 struct posix_acl * ll_get_acl(struct inode *inode, int type)
/* Return a referenced copy of the inode's cached POSIX ACL.  The
 * lli_lock protects lli_posix_acl against concurrent update. */
2927 struct ll_inode_info *lli = ll_i2info(inode);
2928 struct posix_acl *acl = NULL;
2931 spin_lock(&lli->lli_lock);
2932 /* VFS' acl_permission_check->check_acl will release the refcount */
2933 acl = posix_acl_dup(lli->lli_posix_acl);
2934 spin_unlock(&lli->lli_lock);
2939 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* check_acl callback for generic_permission(); the signature depends
 * on the kernel version (3-arg with flags vs. 2-arg). */
2941 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2942 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2944 ll_check_acl(struct inode *inode, int mask)
2947 # ifdef CONFIG_FS_POSIX_ACL
2948 struct posix_acl *acl;
/* Cannot block under RCU-walk; tell the VFS to retry in ref-walk. */
2952 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2953 if (flags & IPERM_FLAG_RCU)
2956 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2961 rc = posix_acl_permission(inode, acl, mask);
2962 posix_acl_release(acl);
2965 # else /* !CONFIG_FS_POSIX_ACL */
2967 # endif /* CONFIG_FS_POSIX_ACL */
2969 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2971 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* VFS ->permission entry point; signature varies per kernel version. */
2972 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2974 # ifdef HAVE_INODE_PERMISION_2ARGS
2975 int ll_inode_permission(struct inode *inode, int mask)
2977 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU path walk cannot block on an RPC; ask the VFS to retry. */
2984 #ifdef MAY_NOT_BLOCK
2985 if (mask & MAY_NOT_BLOCK)
2987 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2988 if (flags & IPERM_FLAG_RCU)
2992 /* as root inode are NOT getting validated in lookup operation,
2993 * need to do it before permission check. */
2995 if (inode == inode->i_sb->s_root->d_inode) {
2996 struct lookup_intent it = { .it_op = IT_LOOKUP };
2998 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2999 MDS_INODELOCK_LOOKUP);
3004 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3005 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote-client mounts delegate the check to the MDS. */
3007 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3008 return lustre_check_remote_perm(inode, mask);
3010 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3011 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Pick the vectored read/write file_operations member names depending
 * on whether this kernel still has ->readv/->writev or the newer
 * ->aio_read/->aio_write interface. */
3016 #ifdef HAVE_FILE_READV
3017 #define READ_METHOD readv
3018 #define READ_FUNCTION ll_file_readv
3019 #define WRITE_METHOD writev
3020 #define WRITE_FUNCTION ll_file_writev
3022 #define READ_METHOD aio_read
3023 #define READ_FUNCTION ll_file_aio_read
3024 #define WRITE_METHOD aio_write
3025 #define WRITE_FUNCTION ll_file_aio_write
3028 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no ->flock/->lock members, so flock()
 * falls back to the kernel's local implementation (-o localflock). */
3029 struct file_operations ll_file_operations = {
3030 .read = ll_file_read,
3031 .READ_METHOD = READ_FUNCTION,
3032 .write = ll_file_write,
3033 .WRITE_METHOD = WRITE_FUNCTION,
3034 .unlocked_ioctl = ll_file_ioctl,
3035 .open = ll_file_open,
3036 .release = ll_file_release,
3037 .mmap = ll_file_mmap,
3038 .llseek = ll_file_seek,
3039 #ifdef HAVE_KERNEL_SENDFILE
3040 .sendfile = ll_file_sendfile,
3042 #ifdef HAVE_KERNEL_SPLICE_READ
3043 .splice_read = ll_file_splice_read,
/* File operations with cluster-wide flock/POSIX locking (-o flock):
 * both ->flock and ->lock go through ll_file_flock. */
3049 struct file_operations ll_file_operations_flock = {
3050 .read = ll_file_read,
3051 .READ_METHOD = READ_FUNCTION,
3052 .write = ll_file_write,
3053 .WRITE_METHOD = WRITE_FUNCTION,
3054 .unlocked_ioctl = ll_file_ioctl,
3055 .open = ll_file_open,
3056 .release = ll_file_release,
3057 .mmap = ll_file_mmap,
3058 .llseek = ll_file_seek,
3059 #ifdef HAVE_KERNEL_SENDFILE
3060 .sendfile = ll_file_sendfile,
3062 #ifdef HAVE_KERNEL_SPLICE_READ
3063 .splice_read = ll_file_splice_read,
3067 .flock = ll_file_flock,
3068 .lock = ll_file_flock
3071 /* These are for -o noflock - to return ENOSYS on flock calls */
3072 struct file_operations ll_file_operations_noflock = {
3073 .read = ll_file_read,
3074 .READ_METHOD = READ_FUNCTION,
3075 .write = ll_file_write,
3076 .WRITE_METHOD = WRITE_FUNCTION,
3077 .unlocked_ioctl = ll_file_ioctl,
3078 .open = ll_file_open,
3079 .release = ll_file_release,
3080 .mmap = ll_file_mmap,
3081 .llseek = ll_file_seek,
3082 #ifdef HAVE_KERNEL_SENDFILE
3083 .sendfile = ll_file_sendfile,
3085 #ifdef HAVE_KERNEL_SPLICE_READ
3086 .splice_read = ll_file_splice_read,
/* -o noflock: lock requests are refused via ll_file_noflock. */
3090 .flock = ll_file_noflock,
3091 .lock = ll_file_noflock
/* Inode operations for regular files. */
3094 struct inode_operations ll_file_inode_operations = {
3095 .setattr = ll_setattr,
3096 .getattr = ll_getattr,
3097 .permission = ll_inode_permission,
3098 .setxattr = ll_setxattr,
3099 .getxattr = ll_getxattr,
3100 .listxattr = ll_listxattr,
3101 .removexattr = ll_removexattr,
3102 #ifdef HAVE_LINUX_FIEMAP_H
3103 .fiemap = ll_fiemap,
3105 #ifdef HAVE_IOP_GET_ACL
3106 .get_acl = ll_get_acl,
3110 /* dynamic ioctl number support routins */
3110 /* dynamic ioctl number support routins */
/* Global registry of dynamically registered ioctl handlers, protected
 * by ioc_sem (readers iterate, writers add/remove entries). */
3111 static struct llioc_ctl_data {
3112 struct rw_semaphore ioc_sem;
3113 cfs_list_t ioc_head;
3115 __RWSEM_INITIALIZER(llioc.ioc_sem),
3116 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration record: callback plus the ioctl numbers it serves;
 * iocd_cmd is a flexible trailing array of iocd_count entries. */
3121 cfs_list_t iocd_list;
3122 unsigned int iocd_size;
3123 llioc_callback_t iocd_cb;
3124 unsigned int iocd_count;
3125 unsigned int iocd_cmd[0];
3128 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
/* Register \a cb to handle the \a count ioctl numbers in \a cmd.
 * Returns an opaque handle (the allocated record) to pass back to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure. */
3131 struct llioc_data *in_data = NULL;
3134 if (cb == NULL || cmd == NULL ||
3135 count > LLIOC_MAX_CMD || count < 0)
3138 size = sizeof(*in_data) + count * sizeof(unsigned int);
3139 OBD_ALLOC(in_data, size);
3140 if (in_data == NULL)
/* NOTE(review): OBD_ALLOC is believed to return zeroed memory, which
 * would make this memset redundant — confirm before removing. */
3143 memset(in_data, 0, sizeof(*in_data));
3144 in_data->iocd_size = size;
3145 in_data->iocd_cb = cb;
3146 in_data->iocd_count = count;
3147 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3149 down_write(&llioc.ioc_sem);
3150 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3151 up_write(&llioc.ioc_sem);
3156 void ll_iocontrol_unregister(void *magic)
/* Remove and free the registration record identified by the handle
 * returned from ll_iocontrol_register(); warn if it is not found. */
3158 struct llioc_data *tmp;
3163 down_write(&llioc.ioc_sem);
3164 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Save the size before freeing: OBD_FREE needs it and tmp is
 * about to go away. */
3166 unsigned int size = tmp->iocd_size;
3168 cfs_list_del(&tmp->iocd_list);
3169 up_write(&llioc.ioc_sem);
3171 OBD_FREE(tmp, size);
3175 up_write(&llioc.ioc_sem);
3177 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3180 EXPORT_SYMBOL(ll_iocontrol_register);
3181 EXPORT_SYMBOL(ll_iocontrol_unregister);
3183 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3184 unsigned int cmd, unsigned long arg, int *rcp)
/* Dispatch \a cmd to the registered dynamic-ioctl handlers in
 * registration order; a handler returning LLIOC_STOP ends the walk.
 * The handler's status is passed back through *rcp. */
3186 enum llioc_iter ret = LLIOC_CONT;
3187 struct llioc_data *data;
3188 int rc = -EINVAL, i;
3190 down_read(&llioc.ioc_sem);
3191 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3192 for (i = 0; i < data->iocd_count; i++) {
3193 if (cmd != data->iocd_cmd[i])
3196 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3200 if (ret == LLIOC_STOP)
3203 up_read(&llioc.ioc_sem);
3210 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
/* Push a layout configuration change down to the cl_object stack.
 * On OBJECT_CONF_SET the layout lock is only allowed to match after
 * the new layout has been applied, so other threads never see a
 * matchable lock with a stale layout. */
3212 struct ll_inode_info *lli = ll_i2info(inode);
3213 struct cl_env_nest nest;
/* No cl_object yet (e.g. no stripe info): nothing to configure. */
3218 if (lli->lli_clob == NULL)
3221 env = cl_env_nested_get(&nest);
3223 RETURN(PTR_ERR(env));
3225 result = cl_conf_set(env, lli->lli_clob, conf);
3226 cl_env_nested_put(&nest, env);
3228 if (conf->coc_opc == OBJECT_CONF_SET) {
3229 struct ldlm_lock *lock = conf->coc_lock;
3231 LASSERT(lock != NULL);
3232 LASSERT(ldlm_has_layout(lock));
3234 /* it can only be allowed to match after layout is
3235 * applied to inode otherwise false layout would be
3236 * seen. Applying layout shoud happen before dropping
3237 * the intent lock. */
3238 ldlm_lock_allow_match(lock);
3244 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3245 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3248 struct ll_sb_info *sbi = ll_i2sbi(inode);
3249 struct obd_capa *oc;
3250 struct ptlrpc_request *req;
3251 struct mdt_body *body;
/* Layout already attached to the lock's LVB: nothing to fetch. */
3258 if (lock->l_lvb_data != NULL)
3261 /* if layout lock was granted right away, the layout is returned
3262 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3263 * blocked and then granted via completion ast, we have to fetch
3264 * layout here. Please note that we can't use the LVB buffer in
3265 * completion AST because it doesn't have a large enough buffer */
3266 oc = ll_mdscapa_get(inode);
3267 rc = ll_get_max_mdsize(sbi, &lmmsize);
3269 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3270 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3276 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Reject a reply whose EA is larger than the size we negotiated. */
3277 if (body == NULL || body->eadatasize > lmmsize)
3278 GOTO(out, rc = -EPROTO);
3280 lmmsize = body->eadatasize;
3281 if (lmmsize == 0) /* empty layout */
3284 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3286 GOTO(out, rc = -EFAULT);
/* Copy the LOV EA into a buffer we can hand over to the lock. */
3288 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3289 if (lvbdata == NULL)
3290 GOTO(out, rc = -ENOMEM);
3292 memcpy(lvbdata, lmm, lmmsize);
3293 lock_res_and_lock(lock);
/* Attach the buffer only if we did not race with another fetcher;
 * otherwise our copy is freed below. */
3294 if (lock->l_lvb_data == NULL) {
3295 lock->l_lvb_data = lvbdata;
3296 lock->l_lvb_len = lmmsize;
3299 unlock_res_and_lock(lock);
3301 if (lvbdata != NULL)
3302 OBD_FREE_LARGE(lvbdata, lmmsize);
3306 ptlrpc_req_finished(req);
3311 * Apply the layout to the inode. Layout lock is held and will be released
3314 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3315 struct inode *inode, __u32 *gen, bool reconf)
/* Apply the layout carried by the (held) layout lock to \a inode and
 * report the resulting layout generation through *gen.  The lock
 * reference taken by the caller is always released before returning.
 * With \a reconf the layout is (re)applied even if it was set before;
 * if cl_conf_set() returns -EBUSY the function waits for in-flight IO
 * against the old layout to drain (OBJECT_CONF_WAIT). */
3317 struct ll_inode_info *lli = ll_i2info(inode);
3318 struct ll_sb_info *sbi = ll_i2sbi(inode);
3319 struct ldlm_lock *lock;
3320 struct lustre_md md = { NULL };
3321 struct cl_object_conf conf;
3324 bool wait_layout = false;
3327 LASSERT(lustre_handle_is_used(lockh));
3329 lock = ldlm_handle2lock(lockh);
3330 LASSERT(lock != NULL);
3331 LASSERT(ldlm_has_layout(lock));
3333 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3334 inode, PFID(&lli->lli_fid), reconf);
3336 /* in case this is a caching lock and reinstate with new inode */
3337 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3339 lock_res_and_lock(lock);
3340 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3341 unlock_res_and_lock(lock);
3342 /* checking lvb_ready is racy but this is okay. The worst case is
3343 * that multi processes may configure the file on the same time. */
3344 if (lvb_ready || !reconf) {
3347 /* layout_gen must be valid if layout lock is not
3348 * cancelled and stripe has already set */
3349 *gen = lli->lli_layout_gen;
/* Make sure the layout is attached to the lock (it may have been
 * granted via completion AST without an LVB — see ll_layout_fetch). */
3355 rc = ll_layout_fetch(inode, lock);
3359 /* for layout lock, lmm is returned in lock's lvb.
3360 * lvb_data is immutable if the lock is held so it's safe to access it
3361 * without res lock. See the description in ldlm_lock_decref_internal()
3362 * for the condition to free lvb_data of layout lock */
3363 if (lock->l_lvb_data != NULL) {
3364 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3365 lock->l_lvb_data, lock->l_lvb_len);
3367 *gen = LL_LAYOUT_GEN_EMPTY;
3369 *gen = md.lsm->lsm_layout_gen;
3372 CERROR("%s: file "DFID" unpackmd error: %d\n",
3373 ll_get_fsname(inode->i_sb, NULL, 0),
3374 PFID(&lli->lli_fid), rc);
3380 /* set layout to file. Unlikely this will fail as old layout was
3381 * surely eliminated */
3382 memset(&conf, 0, sizeof conf);
3383 conf.coc_opc = OBJECT_CONF_SET;
3384 conf.coc_inode = inode;
3385 conf.coc_lock = lock;
3386 conf.u.coc_md = &md;
3387 rc = ll_layout_conf(inode, &conf);
3390 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3392 /* refresh layout failed, need to wait */
3393 wait_layout = rc == -EBUSY;
/* Release the caller's lock reference in every case. */
3397 LDLM_LOCK_PUT(lock);
3398 ldlm_lock_decref(lockh, mode);
3400 /* wait for IO to complete if it's still being used. */
3402 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3403 ll_get_fsname(inode->i_sb, NULL, 0),
3404 inode, PFID(&lli->lli_fid));
3406 memset(&conf, 0, sizeof conf);
3407 conf.coc_opc = OBJECT_CONF_WAIT;
3408 conf.coc_inode = inode;
3409 rc = ll_layout_conf(inode, &conf);
3413 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3414 PFID(&lli->lli_fid), rc);
3420 * This function checks if there exists a LAYOUT lock on the client side,
3421 * or enqueues it if it doesn't have one in cache.
3423 * This function will not hold layout lock so it may be revoked any time after
3424 * this function returns. Any operations depend on layout should be redone
3427 * This function should be called before lov_io_init() to get an uptodate
3428 * layout version, the caller should save the version number and after IO
3429 * is finished, this function should be called again to verify that layout
3430 * is not changed during IO time.
3432 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3434 struct ll_inode_info *lli = ll_i2info(inode);
3435 struct ll_sb_info *sbi = ll_i2sbi(inode);
3436 struct md_op_data *op_data;
3437 struct lookup_intent it;
3438 struct lustre_handle lockh;
3440 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3442 .ei_cb_bl = ll_md_blocking_ast,
3443 .ei_cb_cp = ldlm_completion_ast,
3444 .ei_cbdata = NULL };
3448 *gen = lli->lli_layout_gen;
3449 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3453 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3454 LASSERT(S_ISREG(inode->i_mode));
3456 /* mostly layout lock is caching on the local side, so try to match
3457 * it before grabbing layout lock mutex. */
3458 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3459 if (mode != 0) { /* hit cached lock */
3460 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3464 /* better hold lli_layout_mutex to try again otherwise
3465 * it will have starvation problem. */
3468 /* take layout lock mutex to enqueue layout lock exclusively. */
3469 mutex_lock(&lli->lli_layout_mutex);
3472 /* try again. Maybe somebody else has done this. */
3473 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3474 if (mode != 0) { /* hit cached lock */
3475 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3479 mutex_unlock(&lli->lli_layout_mutex);
3483 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3484 0, 0, LUSTRE_OPC_ANY, NULL);
3485 if (IS_ERR(op_data)) {
3486 mutex_unlock(&lli->lli_layout_mutex);
3487 RETURN(PTR_ERR(op_data));
3490 /* have to enqueue one */
3491 memset(&it, 0, sizeof(it));
3492 it.it_op = IT_LAYOUT;
3493 lockh.cookie = 0ULL;
3495 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3496 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3497 PFID(&lli->lli_fid));
3499 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3501 if (it.d.lustre.it_data != NULL)
3502 ptlrpc_req_finished(it.d.lustre.it_data);
3503 it.d.lustre.it_data = NULL;
3505 ll_finish_md_op_data(op_data);
3507 mode = it.d.lustre.it_lock_mode;
3508 it.d.lustre.it_lock_mode = 0;
3509 ll_intent_drop_lock(&it);
3512 /* set lock data in case this is a new lock */
3513 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3514 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3518 mutex_unlock(&lli->lli_layout_mutex);