4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from its slab cache.
 * NOTE(review): the allocation-failure check and the return statement
 * are not visible in this excerpt — confirm against the full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
61 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (fid, mode, times, size, blocks,
 * flags, ioepoch) plus the open handle @fh into @op_data for an MDS RPC.
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* translate in-core inode flags to the on-wire ext flag encoding */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* if cached data was modified, ask the MDS to note it on this RPC */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
94 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Close the IO epoch and fill @op_data with the attributes the MDS should
 * accept on close.  Size/blocks are only sent when SOM (size-on-MDS) is
 * not in effect or the file is not a regular file.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* och is passed by reference — presumably the callee may consume or
 * clear the handle; confirm against ll_ioepoch_close's prototype */
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for an open handle and clean up local state:
 * performs the Size-on-MDS update when the MDS requests it, clears the
 * DATA_MODIFIED flag on success, destroys orphaned OST objects, and
 * invalidates the handle cookie.
 * NOTE(review): several control-flow lines (braces, gotos, RETURN) are
 * missing from this excerpt; the annotations below follow the visible code.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_flags is protected by lli_lock */
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
/* destroy OST objects referenced by the close reply, if any */
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* with SOM enabled and the epoch still open, defer final attribute
 * update to the DONE_WRITING path */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle matching @flags (write/exec/read) when the
 * last local user of that handle is gone; otherwise just return.
 * Handle selection and use-count are serialized by lli_och_mutex.
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* pick the handle slot and its use counter by open mode */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the open
 * use-count for this fd's mode, and skip the MDS close RPC entirely if
 * we still hold a granted OPEN DLM lock on the inode.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
/* decrement the use-count matching this fd's open mode */
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* no matching OPEN lock cached: must do the real close to the MDS */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
/*
 * VFS ->release() hook.  Tears down per-fd state: remote-ACL session
 * entries for the root inode, the statahead thread this fd owns (if any),
 * async write error propagation, and finally the MDS close via
 * ll_md_close().  The root dentry is special-cased and never sent to MDS.
 */
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
313 #ifdef CONFIG_FS_POSIX_ACL
/* remote-client ACL bookkeeping only applies to the filesystem root */
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* root was opened without an MDS handle — just free local state */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
/* surface any deferred async write error on this close */
345 if (!S_ISDIR(inode->i_mode)) {
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS for @file.  Used on the
 * NFS/re-open paths and (with non-NULL @lmm) to set stripe parameters.
 * On success, primes the inode from the reply and records DLM lock data.
 */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 const char *name = file->f_dentry->d_name.name;
364 const int len = file->f_dentry->d_name.len;
365 struct md_op_data *op_data;
366 struct ptlrpc_request *req;
367 __u32 opc = LUSTRE_OPC_ANY;
374 /* Usually we come here only for NFSD, and we want open lock.
375 But we can also get here with pre 2.6.15 patchless kernels, and in
376 that case that lock is also ok */
377 /* We can also get here if there was cached open handle in revalidate_it
378 * but it disappeared while we were getting from there to ll_file_open.
379 * But this means this file was closed and immediatelly opened which
380 * makes a good candidate for using OPEN lock */
381 /* If lmmsize & lmm are not 0, we are just setting stripe info
382 * parameters. No need for the open lock */
383 if (lmm == NULL && lmmsize == 0) {
384 itp->it_flags |= MDS_OPEN_LOCK;
385 if (itp->it_flags & FMODE_WRITE)
386 opc = LUSTRE_OPC_CREATE;
389 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
390 file->f_dentry->d_inode, name, len,
393 RETURN(PTR_ERR(op_data));
395 itp->it_flags |= MDS_OPEN_BY_FID;
396 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
397 0 /*unused */, &req, ll_md_blocking_ast, 0);
398 ll_finish_md_op_data(op_data);
400 /* reason for keep own exit path - don`t flood log
401 * with messages with -ESTALE errors.
403 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
404 it_open_error(DISP_OPEN_OPEN, itp))
406 ll_release_openhandle(file->f_dentry, itp);
410 if (it_disposition(itp, DISP_LOOKUP_NEG))
411 GOTO(out, rc = -ENOENT);
413 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
414 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
415 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* update the in-core inode from the MDS reply, then attach lock data */
419 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
420 if (!rc && itp->d.lustre.it_lock_mode)
421 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* common exit: release the reply, the disposition, and the intent lock */
425 ptlrpc_req_finished(itp->d.lustre.it_data);
426 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
427 ll_intent_drop_lock(itp);
433 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
434 * not believe attributes if a few ioepoch holders exist. Attributes for
435 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record a newly obtained @ioepoch on the inode (no-op if unchanged). */
437 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
439 if (ioepoch && lli->lli_ioepoch != ioepoch) {
440 lli->lli_ioepoch = ioepoch;
441 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
442 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDT reply carried by the intent,
 * then register the open for replay.  Returns md_set_open_replay_data()'s
 * result.
 */
446 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
447 struct obd_client_handle *och)
449 struct ptlrpc_request *req = it->d.lustre.it_data;
450 struct mdt_body *body;
452 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
453 och->och_fh = body->handle;
454 och->och_fid = body->fid1;
455 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
456 och->och_flags = it->it_flags;
458 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish an open locally: optionally fill @och from the intent reply,
 * record the new IO epoch, then attach @fd to the file and initialize
 * readahead state.  The file must not already have private data.
 */
461 int ll_local_open(struct file *file, struct lookup_intent *it,
462 struct ll_file_data *fd, struct obd_client_handle *och)
464 struct inode *inode = file->f_dentry->d_inode;
465 struct ll_inode_info *lli = ll_i2info(inode);
468 LASSERT(!LUSTRE_FPRIVATE(file));
473 struct ptlrpc_request *req = it->d.lustre.it_data;
474 struct mdt_body *body;
477 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
481 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
482 ll_ioepoch_open(lli, body->ioepoch);
485 LUSTRE_FPRIVATE(file) = fd;
486 ll_readahead_init(inode, &fd->fd_ras);
/* remember the mode this fd was opened with for the close path */
487 fd->fd_omode = it->it_flags;
492 /* Open a file, and (for the very first open) create objects on the OSTs at
493 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
494 * creation or open until ll_lov_setstripe() ioctl is called.
496 * If we already have the stripe MD locally then we don't request it in
497 * md_open(), by passing a lmm_size = 0.
499 * It is up to the application to ensure no other processes open this file
500 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
501 * used. We might be able to avoid races of that sort by getting lli_open_sem
502 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
503 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() hook.  Reuses a cached MDS open handle for this mode when
 * one exists; otherwise performs an intent open (possibly retrying after
 * the dcache-supplied intent proves unusable).  Also claims statahead
 * ownership for directories and handles O_LOV_DELAY_CREATE.
 * NOTE(review): many control-flow lines (braces, RETURNs, the "restart"
 * label) are missing from this excerpt; annotations follow visible code.
 */
505 int ll_file_open(struct inode *inode, struct file *file)
507 struct ll_inode_info *lli = ll_i2info(inode);
508 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
509 .it_flags = file->f_flags };
510 struct obd_client_handle **och_p = NULL;
511 __u64 *och_usecount = NULL;
512 struct ll_file_data *fd;
513 int rc = 0, opendir_set = 0;
516 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
517 inode->i_generation, inode, file->f_flags);
519 it = file->private_data; /* XXX: compat macro */
520 file->private_data = NULL; /* prevent ll_local_open assertion */
522 fd = ll_file_data_get();
524 GOTO(out_openerr, rc = -ENOMEM);
/* first opener of a directory becomes the statahead owner */
527 if (S_ISDIR(inode->i_mode)) {
528 spin_lock(&lli->lli_sa_lock);
529 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
530 lli->lli_opendir_pid == 0) {
531 lli->lli_opendir_key = fd;
532 lli->lli_opendir_pid = cfs_curproc_pid();
535 spin_unlock(&lli->lli_sa_lock);
/* the root dentry never talks to the MDS on open */
538 if (inode->i_sb->s_root == file->f_dentry) {
539 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from the dcache: build our own open intent */
543 if (!it || !it->d.lustre.it_disposition) {
544 /* Convert f_flags into access mode. We cannot use file->f_mode,
545 * because everything but O_ACCMODE mask was stripped from
547 if ((oit.it_flags + 1) & O_ACCMODE)
549 if (file->f_flags & O_TRUNC)
550 oit.it_flags |= FMODE_WRITE;
552 /* kernel only call f_op->open in dentry_open. filp_open calls
553 * dentry_open after call to open_namei that checks permissions.
554 * Only nfsd_open call dentry_open directly without checking
555 * permissions and because of that this code below is safe. */
556 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
557 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
559 /* We do not want O_EXCL here, presumably we opened the file
560 * already? XXX - NFS implications? */
561 oit.it_flags &= ~O_EXCL;
563 /* bug20584, if "it_flags" contains O_CREAT, the file will be
564 * created if necessary, then "IT_CREAT" should be set to keep
565 * consistent with it */
566 if (oit.it_flags & O_CREAT)
567 oit.it_op |= IT_CREAT;
573 /* Let's see if we have file open on MDS already. */
574 if (it->it_flags & FMODE_WRITE) {
575 och_p = &lli->lli_mds_write_och;
576 och_usecount = &lli->lli_open_fd_write_count;
577 } else if (it->it_flags & FMODE_EXEC) {
578 och_p = &lli->lli_mds_exec_och;
579 och_usecount = &lli->lli_open_fd_exec_count;
581 och_p = &lli->lli_mds_read_och;
582 och_usecount = &lli->lli_open_fd_read_count;
585 mutex_lock(&lli->lli_och_mutex);
586 if (*och_p) { /* Open handle is present */
587 if (it_disposition(it, DISP_OPEN_OPEN)) {
588 /* Well, there's extra open request that we do not need,
589 let's close it somehow. This will decref request. */
590 rc = it_open_error(DISP_OPEN_OPEN, it);
592 mutex_unlock(&lli->lli_och_mutex);
593 GOTO(out_openerr, rc);
596 ll_release_openhandle(file->f_dentry, it);
/* share the cached handle; local open only */
600 rc = ll_local_open(file, it, fd, NULL);
603 mutex_unlock(&lli->lli_och_mutex);
604 GOTO(out_openerr, rc);
607 LASSERT(*och_usecount == 0);
608 if (!it->d.lustre.it_disposition) {
609 /* We cannot just request lock handle now, new ELC code
610 means that one of other OPEN locks for this file
611 could be cancelled, and since blocking ast handler
612 would attempt to grab och_mutex as well, that would
613 result in a deadlock */
614 mutex_unlock(&lli->lli_och_mutex);
615 it->it_create_mode |= M_CHECK_STALE;
616 rc = ll_intent_file_open(file, NULL, 0, it);
617 it->it_create_mode &= ~M_CHECK_STALE;
619 GOTO(out_openerr, rc);
623 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
625 GOTO(out_och_free, rc = -ENOMEM);
629 /* md_intent_lock() didn't get a request ref if there was an
630 * open error, so don't do cleanup on the request here
632 /* XXX (green): Should not we bail out on any error here, not
633 * just open error? */
634 rc = it_open_error(DISP_OPEN_OPEN, it);
636 GOTO(out_och_free, rc);
638 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
640 rc = ll_local_open(file, it, fd, *och_p);
642 GOTO(out_och_free, rc);
644 mutex_unlock(&lli->lli_och_mutex);
647 /* Must do this outside lli_och_mutex lock to prevent deadlock where
648 different kind of OPEN lock for this same inode gets cancelled
649 by ldlm_cancel_lru */
650 if (!S_ISREG(inode->i_mode))
651 GOTO(out_och_free, rc);
/* object creation may be deferred until first write or setstripe */
655 if (!lli->lli_has_smd) {
656 if (file->f_flags & O_LOV_DELAY_CREATE ||
657 !(file->f_mode & FMODE_WRITE)) {
658 CDEBUG(D_INODE, "object creation was delayed\n");
659 GOTO(out_och_free, rc);
662 file->f_flags &= ~O_LOV_DELAY_CREATE;
663 GOTO(out_och_free, rc);
/* error cleanup: free the handle slot, statahead claim, and fd */
667 if (och_p && *och_p) {
668 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
669 *och_p = NULL; /* OBD_FREE writes some magic there */
672 mutex_unlock(&lli->lli_och_mutex);
675 if (opendir_set != 0)
676 ll_stop_statahead(inode, lli->lli_opendir_key);
678 ll_file_data_put(fd);
680 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
683 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
684 ptlrpc_req_finished(it->d.lustre.it_data);
685 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
691 /* Fills the obdo with the attributes for the lsm */
/*
 * Fill @obdo with aggregate attributes for @lsm by issuing async getattr
 * RPCs to the OSTs and waiting for the set.  With @sync, requests the
 * server-side lock (OBD_FL_SRVLOCK).  On success only the OST-authoritative
 * valid bits are retained in o_valid.
 */
692 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
693 struct obd_capa *capa, struct obdo *obdo,
694 __u64 ioepoch, int sync)
696 struct ptlrpc_request_set *set;
697 struct obd_info oinfo = { { { 0 } } };
702 LASSERT(lsm != NULL);
706 oinfo.oi_oa->o_oi = lsm->lsm_oi;
707 oinfo.oi_oa->o_mode = S_IFREG;
708 oinfo.oi_oa->o_ioepoch = ioepoch;
709 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
710 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
711 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
712 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
713 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
714 OBD_MD_FLDATAVERSION;
715 oinfo.oi_capa = capa;
/* sync mode: take the lock on the server side instead of the client */
717 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
718 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
721 set = ptlrpc_prep_set();
723 CERROR("can't allocate ptlrpc set\n");
726 rc = obd_getattr_async(exp, &oinfo, set);
728 rc = ptlrpc_set_wait(set);
729 ptlrpc_set_destroy(set);
/* keep only the bits the OSTs are authoritative for */
732 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
733 OBD_MD_FLATIME | OBD_MD_FLMTIME |
734 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
735 OBD_MD_FLDATAVERSION);
740 * Performs the getattr on the inode and updates its fields.
741 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Getattr on @inode's OST objects and refresh the in-core inode fields
 * from the result.  With @sync != 0, the getattr runs under the
 * server-side lock.
 */
743 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
744 __u64 ioepoch, int sync)
746 struct obd_capa *capa = ll_mdscapa_get(inode);
747 struct lov_stripe_md *lsm;
751 lsm = ccc_inode_lsm_get(inode);
752 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
753 capa, obdo, ioepoch, sync);
756 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
758 obdo_refresh_inode(inode, obdo, obdo->o_valid);
759 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
760 " blksize %lu\n", POSTID(oi), i_size_read(inode),
761 (unsigned long long)inode->i_blocks,
762 (unsigned long)ll_inode_blksize(inode));
764 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-supplied timestamps with attributes obtained from the OSTs
 * (via the cl_object layer) and write the winning values — most recent
 * times, OST size/blocks — back into the inode, all under the inode
 * size lock.
 */
768 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
770 struct ll_inode_info *lli = ll_i2info(inode);
771 struct cl_object *obj = lli->lli_clob;
772 struct cl_attr *attr = ccc_env_thread_attr(env);
778 ll_inode_size_lock(inode);
779 /* merge timestamps the most recently obtained from mds with
780 timestamps obtained from osts */
781 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
782 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
783 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
784 inode_init_lvb(inode, &lvb);
786 cl_object_attr_lock(obj);
787 rc = cl_object_attr_get(env, obj, attr);
788 cl_object_attr_unlock(obj);
/* take the newer of MDS vs OST timestamps */
791 if (lvb.lvb_atime < attr->cat_atime)
792 lvb.lvb_atime = attr->cat_atime;
793 if (lvb.lvb_ctime < attr->cat_ctime)
794 lvb.lvb_ctime = attr->cat_ctime;
795 if (lvb.lvb_mtime < attr->cat_mtime)
796 lvb.lvb_mtime = attr->cat_mtime;
798 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
799 PFID(&lli->lli_fid), attr->cat_size);
800 cl_isize_write_nolock(inode, attr->cat_size);
802 inode->i_blocks = attr->cat_blocks;
804 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
805 LTIME_S(inode->i_atime) = lvb.lvb_atime;
806 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
808 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctl: fetch OST attributes for @lsm and copy
 * size/blocks/times into the caller-supplied stat structure.
 */
813 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
816 struct obdo obdo = { 0 };
819 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
821 st->st_size = obdo.o_size;
822 st->st_blocks = obdo.o_blocks;
823 st->st_mtime = obdo.o_mtime;
824 st->st_atime = obdo.o_atime;
825 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics and pick
 * the DLM lock requirement (never for nolock files, mandatory for append).
 */
830 void ll_io_init(struct cl_io *io, const struct file *file, int write)
832 struct inode *inode = file->f_dentry->d_inode;
834 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
836 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
837 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
838 file->f_flags & O_DIRECT ||
841 io->ci_obj = ll_i2info(inode)->lli_clob;
842 io->ci_lockreq = CILR_MAYBE;
843 if (ll_file_nolock(file)) {
844 io->ci_lockreq = CILR_NEVER;
845 io->ci_no_srvlock = 1;
846 } else if (file->f_flags & O_APPEND) {
847 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common engine for all read/write entry points (normal, sendfile,
 * splice).  Sets up the cl_io, copies the per-subtype arguments into the
 * vvp/ccc IO state, serializes writes against truncate via
 * lli_write_mutex / lli_trunc_sem, runs the IO loop, advances *ppos, and
 * tallies read/write statistics.
 * NOTE(review): the restart path and several braces are not visible in
 * this excerpt; comments follow the visible code only.
 */
852 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
853 struct file *file, enum cl_io_type iot,
854 loff_t *ppos, size_t count)
856 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
857 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
863 io = ccc_env_thread_io(env);
864 ll_io_init(io, file, iot == CIT_WRITE);
866 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
867 struct vvp_io *vio = vvp_env_io(env);
868 struct ccc_io *cio = ccc_env_io(env);
869 int write_mutex_locked = 0;
871 cio->cui_fd = LUSTRE_FPRIVATE(file);
872 vio->cui_io_subtype = args->via_io_subtype;
/* copy subtype-specific arguments into the IO state */
874 switch (vio->cui_io_subtype) {
876 cio->cui_iov = args->u.normal.via_iov;
877 cio->cui_nrsegs = args->u.normal.via_nrsegs;
878 cio->cui_tot_nrsegs = cio->cui_nrsegs;
879 #ifndef HAVE_FILE_WRITEV
880 cio->cui_iocb = args->u.normal.via_iocb;
/* non-group-lock writes serialize against truncate via lli_write_mutex;
 * reads only need the shared trunc_sem */
882 if ((iot == CIT_WRITE) &&
883 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
884 if (mutex_lock_interruptible(&lli->
886 GOTO(out, result = -ERESTARTSYS);
887 write_mutex_locked = 1;
888 } else if (iot == CIT_READ) {
889 down_read(&lli->lli_trunc_sem);
893 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
894 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
897 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
898 vio->u.splice.cui_flags = args->u.splice.via_flags;
901 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
904 result = cl_io_loop(env, io);
905 if (write_mutex_locked)
906 mutex_unlock(&lli->lli_write_mutex);
907 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
908 up_read(&lli->lli_trunc_sem);
910 /* cl_io_rw_init() handled IO */
911 result = io->ci_result;
/* on progress, advance the caller's file position */
914 if (io->ci_nob > 0) {
916 *ppos = io->u.ci_wr.wr.crw_pos;
921 /* If any bit been read/written (result != 0), we just return
922 * short read/write instead of restart io. */
923 if (result == 0 && io->ci_need_restart) {
924 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
925 iot == CIT_READ ? "read" : "write",
926 file->f_dentry->d_name.name, *ppos, count);
927 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
931 if (iot == CIT_READ) {
933 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
934 LPROC_LL_READ_BYTES, result);
935 } else if (iot == CIT_WRITE) {
937 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
938 LPROC_LL_WRITE_BYTES, result);
939 fd->fd_write_failed = false;
940 } else if (result != -ERESTARTSYS) {
941 fd->fd_write_failed = true;
950 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, trimming
 * segments that fail access_ok.  Mirrors the kernel's
 * __generic_file_aio_write_nolock sanity checks.
 */
952 static int ll_file_get_iov_count(const struct iovec *iov,
953 unsigned long *nr_segs, size_t *count)
958 for (seg = 0; seg < *nr_segs; seg++) {
959 const struct iovec *iv = &iov[seg];
962 * If any segment has a negative length, or the cumulative
963 * length ever wraps negative then return -EINVAL.
966 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
968 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
973 cnt -= iv->iov_len; /* This segment is no good */
#ifdef HAVE_FILE_READV
/*
 * ->readv() entry point (pre-AIO kernels): validate the iovec, then run
 * a normal CIT_READ through ll_file_io_generic().
 */
981 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
982 unsigned long nr_segs, loff_t *ppos)
985 struct vvp_io_args *args;
991 result = ll_file_get_iov_count(iov, &nr_segs, &count);
995 env = cl_env_get(&refcheck);
997 RETURN(PTR_ERR(env));
999 args = vvp_env_args(env, IO_NORMAL);
1000 args->u.normal.via_iov = (struct iovec *)iov;
1001 args->u.normal.via_nrsegs = nr_segs;
1003 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1004 cl_env_put(env, &refcheck);
/*
 * ->read() entry point (readv variant): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_readv().
 */
1008 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1012 struct iovec *local_iov;
1017 env = cl_env_get(&refcheck);
1019 RETURN(PTR_ERR(env));
1021 local_iov = &vvp_env_info(env)->vti_local_iov;
1022 local_iov->iov_base = (void __user *)buf;
1023 local_iov->iov_len = count;
1024 result = ll_file_readv(file, local_iov, 1, ppos);
1025 cl_env_put(env, &refcheck);
/*
 * ->aio_read() entry point: validate the iovec, stash the kiocb in the
 * IO args, and run CIT_READ at the iocb's position.
 */
1030 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1031 unsigned long nr_segs, loff_t pos)
1034 struct vvp_io_args *args;
1040 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1044 env = cl_env_get(&refcheck);
1046 RETURN(PTR_ERR(env));
1048 args = vvp_env_args(env, IO_NORMAL);
1049 args->u.normal.via_iov = (struct iovec *)iov;
1050 args->u.normal.via_nrsegs = nr_segs;
1051 args->u.normal.via_iocb = iocb;
1053 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1054 &iocb->ki_pos, count);
1055 cl_env_put(env, &refcheck);
/*
 * ->read() entry point (AIO variant): build a synchronous kiocb plus a
 * one-segment iovec around the user buffer and delegate to
 * ll_file_aio_read(), then propagate the updated position.
 */
1059 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1063 struct iovec *local_iov;
1064 struct kiocb *kiocb;
1069 env = cl_env_get(&refcheck);
1071 RETURN(PTR_ERR(env));
1073 local_iov = &vvp_env_info(env)->vti_local_iov;
1074 kiocb = &vvp_env_info(env)->vti_kiocb;
1075 local_iov->iov_base = (void __user *)buf;
1076 local_iov->iov_len = count;
1077 init_sync_kiocb(kiocb, file);
1078 kiocb->ki_pos = *ppos;
1079 kiocb->ki_left = count;
1081 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1082 *ppos = kiocb->ki_pos;
1084 cl_env_put(env, &refcheck);
1090 * Write to a file (through the page cache).
1092 #ifdef HAVE_FILE_WRITEV
/*
 * ->writev() entry point (pre-AIO kernels): validate the iovec, then run
 * a normal CIT_WRITE through ll_file_io_generic().
 */
1093 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1094 unsigned long nr_segs, loff_t *ppos)
1097 struct vvp_io_args *args;
1103 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1107 env = cl_env_get(&refcheck);
1109 RETURN(PTR_ERR(env));
1111 args = vvp_env_args(env, IO_NORMAL);
1112 args->u.normal.via_iov = (struct iovec *)iov;
1113 args->u.normal.via_nrsegs = nr_segs;
1115 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1116 cl_env_put(env, &refcheck);
/*
 * ->write() entry point (writev variant): wrap the user buffer in a
 * single-segment iovec and delegate to ll_file_writev().
 */
1120 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1124 struct iovec *local_iov;
1129 env = cl_env_get(&refcheck);
1131 RETURN(PTR_ERR(env));
1133 local_iov = &vvp_env_info(env)->vti_local_iov;
1134 local_iov->iov_base = (void __user *)buf;
1135 local_iov->iov_len = count;
1137 result = ll_file_writev(file, local_iov, 1, ppos);
1138 cl_env_put(env, &refcheck);
1142 #else /* AIO stuff */
/*
 * ->aio_write() entry point: validate the iovec, stash the kiocb in the
 * IO args, and run CIT_WRITE at the iocb's position.
 */
1143 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1144 unsigned long nr_segs, loff_t pos)
1147 struct vvp_io_args *args;
1153 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1157 env = cl_env_get(&refcheck);
1159 RETURN(PTR_ERR(env));
1161 args = vvp_env_args(env, IO_NORMAL);
1162 args->u.normal.via_iov = (struct iovec *)iov;
1163 args->u.normal.via_nrsegs = nr_segs;
1164 args->u.normal.via_iocb = iocb;
1166 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1167 &iocb->ki_pos, count);
1168 cl_env_put(env, &refcheck);
/*
 * ->write() entry point (AIO variant): build a synchronous kiocb plus a
 * one-segment iovec around the user buffer and delegate to
 * ll_file_aio_write(), then propagate the updated position.
 */
1172 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1176 struct iovec *local_iov;
1177 struct kiocb *kiocb;
1182 env = cl_env_get(&refcheck);
1184 RETURN(PTR_ERR(env));
1186 local_iov = &vvp_env_info(env)->vti_local_iov;
1187 kiocb = &vvp_env_info(env)->vti_kiocb;
1188 local_iov->iov_base = (void __user *)buf;
1189 local_iov->iov_len = count;
1190 init_sync_kiocb(kiocb, file);
1191 kiocb->ki_pos = *ppos;
1192 kiocb->ki_left = count;
1194 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1195 *ppos = kiocb->ki_pos;
1197 cl_env_put(env, &refcheck);
1203 #ifdef HAVE_KERNEL_SENDFILE
1205 * Send file content (through pagecache) somewhere with helper
/*
 * ->sendfile() entry point: route page-cache content to @target through
 * @actor via an IO_SENDFILE-subtype CIT_READ.
 */
1207 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1208 read_actor_t actor, void *target)
1211 struct vvp_io_args *args;
1216 env = cl_env_get(&refcheck);
1218 RETURN(PTR_ERR(env));
1220 args = vvp_env_args(env, IO_SENDFILE);
1221 args->u.sendfile.via_target = target;
1222 args->u.sendfile.via_actor = actor;
1224 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1225 cl_env_put(env, &refcheck);
1230 #ifdef HAVE_KERNEL_SPLICE_READ
1232 * Send file content (through pagecache) somewhere with helper
/*
 * ->splice_read() entry point: feed page-cache content into @pipe via an
 * IO_SPLICE-subtype CIT_READ.
 */
1234 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1235 struct pipe_inode_info *pipe, size_t count,
1239 struct vvp_io_args *args;
1244 env = cl_env_get(&refcheck);
1246 RETURN(PTR_ERR(env));
1248 args = vvp_env_args(env, IO_SPLICE);
1249 args->u.splice.via_pipe = pipe;
1250 args->u.splice.via_flags = flags;
1252 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1253 cl_env_put(env, &refcheck);
/*
 * Recreate a lost OST object for @inode: duplicate the stripe metadata,
 * mark the obdo with OBD_FL_RECREATE_OBJS, and re-issue obd_create()
 * under the inode size lock.  @ost_idx is smuggled via oa->o_nlink.
 */
1258 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1261 struct obd_export *exp = ll_i2dtexp(inode);
1262 struct obd_trans_info oti = { 0 };
1263 struct obdo *oa = NULL;
1266 struct lov_stripe_md *lsm = NULL, *lsm2;
1273 lsm = ccc_inode_lsm_get(inode);
1275 GOTO(out, rc = -ENOENT);
1277 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1278 (lsm->lsm_stripe_count));
1280 OBD_ALLOC_LARGE(lsm2, lsm_size);
1282 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1285 oa->o_nlink = ost_idx;
1286 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1287 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1288 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1289 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1290 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1291 memcpy(lsm2, lsm, lsm_size);
1292 ll_inode_size_lock(inode);
1293 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1294 ll_inode_size_unlock(inode);
1296 OBD_FREE_LARGE(lsm2, lsm_size);
1299 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_recreate_obj(): LL_IOC_RECREATE_OBJ handler.  Root-only: copies a
 * ll_recreate_obj request from userspace, builds an MDT0 ost_id from the
 * requested object id and forwards to ll_lov_recreate().
 * NOTE(review): error-return lines for the capability and copy checks are
 * elided in this excerpt.
 */
1304 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1306 struct ll_recreate_obj ucreat;
1310 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1313 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1317 ostid_set_seq_mdt0(&oi);
1318 ostid_set_id(&oi, ucreat.lrc_id);
1319 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * ll_lov_recreate_fid(): LL_IOC_RECREATE_FID handler.  Root-only: copies a
 * lu_fid from userspace, converts it to an ost_id, derives the OST index
 * from bits 16-31 of the fid sequence, and calls ll_lov_recreate().
 */
1322 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1329 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1332 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1335 fid_to_ostid(&fid, &oi);
1336 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1337 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * ll_lov_setstripe_ea_info(): set striping for @inode by performing an
 * IT_OPEN intent carrying the lov_user_md EA.  Fails early if a layout
 * already exists; otherwise opens with the intent under the inode size
 * lock and releases the resulting open handle.
 * NOTE(review): the cleanup-label structure (out/out_req_free) is only
 * partially visible — lines are elided in this excerpt.
 */
1340 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1341 int flags, struct lov_user_md *lum, int lum_size)
1343 struct lov_stripe_md *lsm = NULL;
1344 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* a non-NULL lsm means the stripe EA already exists and cannot be reset */
1348 lsm = ccc_inode_lsm_get(inode);
1350 ccc_inode_lsm_put(inode, lsm);
1351 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1356 ll_inode_size_lock(inode);
1357 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1360 rc = oit.d.lustre.it_status;
1362 GOTO(out_req_free, rc);
/* we only needed the intent for its EA side effect; close the handle */
1364 ll_release_openhandle(file->f_dentry, &oit);
1367 ll_inode_size_unlock(inode);
1368 ll_intent_release(&oit);
1369 ccc_inode_lsm_put(inode, lsm);
1372 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info(): fetch the LOV EA (striping metadata) of
 * @filename under @inode via md_getattr_name().  Validates the magic
 * (V1/V3 only) and byte-swaps the user md to host endianness on
 * big-endian hosts before handing it back through @lmmp/@lmm_size.
 * The ptlrpc request is returned via @request for the caller to release.
 * NOTE(review): some declarations (rc, lmmsize) and the out: label are
 * elided in this excerpt.
 */
1376 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1377 struct lov_mds_md **lmmp, int *lmm_size,
1378 struct ptlrpc_request **request)
1380 struct ll_sb_info *sbi = ll_i2sbi(inode);
1381 struct mdt_body *body;
1382 struct lov_mds_md *lmm = NULL;
1383 struct ptlrpc_request *req = NULL;
1384 struct md_op_data *op_data;
1387 rc = ll_get_max_mdsize(sbi, &lmmsize);
1391 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1392 strlen(filename), lmmsize,
1393 LUSTRE_OPC_ANY, NULL);
1394 if (IS_ERR(op_data))
1395 RETURN(PTR_ERR(op_data));
1397 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1398 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1399 ll_finish_md_op_data(op_data);
1401 CDEBUG(D_INFO, "md_getattr_name failed "
1402 "on %s: rc %d\n", filename, rc);
1406 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1407 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1409 lmmsize = body->eadatasize;
/* no EA bits valid (or zero size, per elided clause) -> no striping data */
1411 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1413 GOTO(out, rc = -ENODATA);
1416 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1417 LASSERT(lmm != NULL);
1419 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1420 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1421 GOTO(out, rc = -EPROTO);
1425 * This is coming from the MDS, so is probably in
1426 * little endian. We convert it to host endian before
1427 * passing it to userspace.
/* LOV_MAGIC != le32 form only on big-endian hosts: swab needed there */
1429 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1430 /* if function called for directory - we should
1431 * avoid swab not existent lsm objects */
1432 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1433 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1434 if (S_ISREG(body->mode))
1435 lustre_swab_lov_user_md_objects(
1436 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1437 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1438 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1439 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1440 if (S_ISREG(body->mode))
1441 lustre_swab_lov_user_md_objects(
1442 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1443 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1449 *lmm_size = lmmsize;
/*
 * ll_lov_setea(): LL_IOC_LOV_SETEA handler.  Root-only: copies a
 * lov_user_md (with one trailing lov_user_ost_data) from userspace and
 * applies it through ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * The temporary buffer is freed on both the error and success paths.
 */
1454 static int ll_lov_setea(struct inode *inode, struct file *file,
1457 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1458 struct lov_user_md *lump;
1459 int lum_size = sizeof(struct lov_user_md) +
1460 sizeof(struct lov_user_ost_data);
1464 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1467 OBD_ALLOC_LARGE(lump, lum_size);
1471 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1472 OBD_FREE_LARGE(lump, lum_size);
1476 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1478 OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE handler.  Copies the user md
 * as V1 first (the smaller layout) and re-copies as V3 when the magic
 * says so, then applies it via ll_lov_setstripe_ea_info().  On success
 * it refreshes the layout and echoes the resulting striping back to
 * userspace through obd_iocontrol(LL_IOC_LOV_GETSTRIPE).
 * NOTE(review): lum_size/gen declarations and EFAULT returns are elided.
 */
1482 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1485 struct lov_user_md_v3 lumv3;
1486 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1487 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1488 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1490 int flags = FMODE_WRITE;
1493 /* first try with v1 which is smaller than v3 */
1494 lum_size = sizeof(struct lov_user_md_v1);
1495 if (copy_from_user(lumv1, lumv1p, lum_size))
1498 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1499 lum_size = sizeof(struct lov_user_md_v3);
1500 if (copy_from_user(&lumv3, lumv3p, lum_size))
1504 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1506 struct lov_stripe_md *lsm;
/* zero the user's stripe_count first so a partial copy is detectable */
1509 put_user(0, &lumv1p->lmm_stripe_count);
1511 ll_layout_refresh(inode, &gen);
1512 lsm = ccc_inode_lsm_get(inode);
1513 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1514 0, lsm, (void *)arg);
1515 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_getstripe(): LL_IOC_LOV_GETSTRIPE handler — return the inode's
 * striping to userspace via obd_iocontrol() on the data export.
 * NOTE(review): the NULL-lsm check and RETURN are elided in this excerpt.
 */
1520 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1522 struct lov_stripe_md *lsm;
1526 lsm = ccc_inode_lsm_get(inode);
1528 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1530 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_get_grouplock(): LL_IOC_GROUP_LOCK handler.  Takes a cluster group
 * lock with gid @arg on the file.  fd_flags/fd_grouplock are protected by
 * lli->lli_lock; the spinlock is dropped around the (potentially blocking)
 * cl_get_grouplock() call, so the LL_FILE_GROUP_LOCKED flag is re-checked
 * afterwards to resolve the race with a concurrent locker.
 */
1534 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1536 struct ll_inode_info *lli = ll_i2info(inode);
1537 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1538 struct ccc_grouplock grouplock;
1542 if (ll_file_nolock(file))
1543 RETURN(-EOPNOTSUPP);
1545 spin_lock(&lli->lli_lock);
1546 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1547 CWARN("group lock already existed with gid %lu\n",
1548 fd->fd_grouplock.cg_gid);
1549 spin_unlock(&lli->lli_lock);
1552 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1553 spin_unlock(&lli->lli_lock);
1555 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1556 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have locked meanwhile */
1560 spin_lock(&lli->lli_lock);
1561 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1562 spin_unlock(&lli->lli_lock);
1563 CERROR("another thread just won the race\n");
1564 cl_put_grouplock(&grouplock);
1568 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1569 fd->fd_grouplock = grouplock;
1570 spin_unlock(&lli->lli_lock);
1572 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock(): LL_IOC_GROUP_UNLOCK handler.  Releases the group
 * lock with gid @arg, verifying under lli->lli_lock that a group lock is
 * held and its gid matches.  The grouplock is copied out and fd state is
 * cleared before dropping the spinlock; cl_put_grouplock() then runs
 * outside the lock.
 */
1576 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1578 struct ll_inode_info *lli = ll_i2info(inode);
1579 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1580 struct ccc_grouplock grouplock;
1583 spin_lock(&lli->lli_lock);
1584 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1585 spin_unlock(&lli->lli_lock);
1586 CWARN("no group lock held\n");
1589 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1591 if (fd->fd_grouplock.cg_gid != arg) {
1592 CWARN("group lock %lu doesn't match current id %lu\n",
1593 arg, fd->fd_grouplock.cg_gid);
1594 spin_unlock(&lli->lli_lock);
1598 grouplock = fd->fd_grouplock;
1599 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1600 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1601 spin_unlock(&lli->lli_lock);
1603 cl_put_grouplock(&grouplock);
1604 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * ll_release_openhandle(): close the MDS open handle that an IT_OPEN
 * intent produced, when the caller does not intend to keep the file open.
 * No-ops for the filesystem root and for intents with no open to close.
 * NOTE(review): rc declaration, och NULL check and the out:/RETURN lines
 * are elided in this excerpt.
 */
1609 * Close inode open handle
1611 * \param dentry [in] dentry which contains the inode
1612 * \param it [in,out] intent which contains open info and result
1615 * \retval <0 failure
1617 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1619 struct inode *inode = dentry->d_inode;
1620 struct obd_client_handle *och;
1626 /* Root ? Do nothing. */
1627 if (dentry->d_inode->i_sb->s_root == dentry)
1630 /* No open handle to close? Move away */
1631 if (!it_disposition(it, DISP_OPEN_OPEN))
1634 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1636 OBD_ALLOC(och, sizeof(*och));
1638 GOTO(out, rc = -ENOMEM);
1640 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1642 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1645 /* this one is in place of ll_file_open */
1646 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1647 ptlrpc_req_finished(it->d.lustre.it_data);
1648 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ll_do_fiemap(): validate FIEMAP flags, optionally flush dirty pages for
 * FIEMAP_FLAG_SYNC, and forward the mapping request to the OSTs via
 * obd_get_info(KEY_FIEMAP).  Striped files (stripe_count > 1) require the
 * caller to understand FIEMAP_FLAG_DEVICE_ORDER.
 * NOTE(review): rc declaration, several branch bodies and the RETURN are
 * elided in this excerpt.
 */
1654 * Get size for inode for which FIEMAP mapping is requested.
1655 * Make the FIEMAP get_info call and returns the result.
1657 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1660 struct obd_export *exp = ll_i2dtexp(inode);
1661 struct lov_stripe_md *lsm = NULL;
1662 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1663 int vallen = num_bytes;
1667 /* Checks for fiemap flags */
1668 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do support, then fail (per FIEMAP ABI) */
1669 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1673 /* Check for FIEMAP_FLAG_SYNC */
1674 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1675 rc = filemap_fdatawrite(inode->i_mapping);
1680 lsm = ccc_inode_lsm_get(inode);
1684 /* If the stripe_count > 1 and the application does not understand
1685 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1687 if (lsm->lsm_stripe_count > 1 &&
1688 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1689 GOTO(out, rc = -EOPNOTSUPP);
1691 fm_key.oa.o_oi = lsm->lsm_oi;
1692 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1694 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1695 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1696 /* If filesize is 0, then there would be no objects for mapping */
1697 if (fm_key.oa.o_size == 0) {
1698 fiemap->fm_mapped_extents = 0;
1702 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1704 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1707 CERROR("obd_get_info failed: rc = %d\n", rc);
1710 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_fid2path(): OBD_IOC_FID2PATH handler.  Copies the user's
 * getinfo_fid2path header to learn gf_pathlen, allocates an output buffer
 * sized for the path, asks the MDC (obd_iocontrol) to resolve the fid to
 * a path, and copies the result back to userspace.  Access requires
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * NOTE(review): rc/outsize declarations, EFAULT/ENOMEM returns and the
 * gfin free are elided in this excerpt.
 */
1714 int ll_fid2path(struct inode *inode, void *arg)
1716 struct obd_export *exp = ll_i2mdexp(inode);
1717 struct getinfo_fid2path *gfout, *gfin;
1721 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1722 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1725 /* Need to get the buflen */
1726 OBD_ALLOC_PTR(gfin);
1729 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1734 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1735 OBD_ALLOC(gfout, outsize);
1736 if (gfout == NULL) {
1740 memcpy(gfout, gfin, sizeof(*gfout));
1743 /* Call mdc_iocontrol */
1744 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1748 if (copy_to_user(arg, gfout, outsize))
1752 OBD_FREE(gfout, outsize);
/*
 * ll_ioctl_fiemap(): FSFILT_IOC_FIEMAP handler.  Reads fm_extent_count
 * to size a kernel fiemap buffer, copies the request (and, when extents
 * were supplied, the first extent) in, runs ll_do_fiemap(), then copies
 * the header plus mapped extents back out.
 * NOTE(review): rc declaration, some condition lines and the RETURN are
 * elided in this excerpt.  The num_bytes multiplication is unchecked for
 * overflow against a huge user-supplied extent_count — TODO confirm
 * upstream handling.
 */
1756 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1758 struct ll_user_fiemap *fiemap_s;
1759 size_t num_bytes, ret_bytes;
1760 unsigned int extent_count;
1763 /* Get the extent count so we can calculate the size of
1764 * required fiemap buffer */
1765 if (get_user(extent_count,
1766 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1768 num_bytes = sizeof(*fiemap_s) + (extent_count *
1769 sizeof(struct ll_fiemap_extent));
1771 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1772 if (fiemap_s == NULL)
1775 /* get the fiemap value */
1776 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1778 GOTO(error, rc = -EFAULT);
1780 /* If fm_extent_count is non-zero, read the first extent since
1781 * it is used to calculate end_offset and device from previous
1784 if (copy_from_user(&fiemap_s->fm_extents[0],
1785 (char __user *)arg + sizeof(*fiemap_s),
1786 sizeof(struct ll_fiemap_extent)))
1787 GOTO(error, rc = -EFAULT);
1790 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1794 ret_bytes = sizeof(struct ll_user_fiemap);
1796 if (extent_count != 0)
1797 ret_bytes += (fiemap_s->fm_mapped_extents *
1798 sizeof(struct ll_fiemap_extent));
1800 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1804 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * ll_data_version(): fetch the OST-computed data version of @inode into
 * *data_version via ll_lsm_getattr().  A file with no stripes reports
 * version 0 (per the comment below); OBD_MD_FLDATAVERSION must be valid
 * in the returned obdo for the value to be used.
 * NOTE(review): rc declaration, NULL checks and the obdo free/RETURN are
 * elided in this excerpt.
 */
1809 * Read the data_version for inode.
1811 * This value is computed using stripe object version on OST.
1812 * Version is computed using server side locking.
1814 * @param extent_lock Take extent lock. Not needed if a process is already
1815 * holding the OST object group locks.
1817 int ll_data_version(struct inode *inode, __u64 *data_version,
1820 struct lov_stripe_md *lsm = NULL;
1821 struct ll_sb_info *sbi = ll_i2sbi(inode);
1822 struct obdo *obdo = NULL;
1826 /* If no stripe, we consider version is 0. */
1827 lsm = ccc_inode_lsm_get(inode);
1830 CDEBUG(D_INODE, "No object for inode\n");
1834 OBD_ALLOC_PTR(obdo);
1836 ccc_inode_lsm_put(inode, lsm);
1840 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1842 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1845 *data_version = obdo->o_data_version;
1849 ccc_inode_lsm_put(inode, lsm);
/*
 * State kept on the heap by ll_swap_layouts() so the two files' saved
 * times, inodes, data versions (elided field) and check flags can be
 * swapped together when the operation is sequentialized by fid order.
 */
1854 struct ll_swap_stack {
1855 struct iattr ia1, ia2;
1857 struct inode *inode1, *inode2;
1858 bool check_dv1, check_dv2;
/*
 * ll_swap_layouts(): LL_IOC_LOV_SWAP_LAYOUTS implementation.  Validates
 * both files (regular, writable, same superblock), orders them by fid to
 * avoid deadlock, optionally takes group locks to flush dirty cache,
 * verifies data versions if requested, then asks the MDT to swap the
 * layouts via obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS).  Saved mtime/atime
 * are restored afterwards when the KEEP flags ask for it.
 * NOTE(review): rc/rc1/gid/dv declarations, several GOTO targets and the
 * final free/RETURN are elided in this excerpt.
 */
1861 static int ll_swap_layouts(struct file *file1, struct file *file2,
1862 struct lustre_swap_layouts *lsl)
1864 struct mdc_swap_layouts msl;
1865 struct md_op_data *op_data;
1868 struct ll_swap_stack *llss = NULL;
1871 OBD_ALLOC_PTR(llss);
1875 llss->inode1 = file1->f_dentry->d_inode;
1876 llss->inode2 = file2->f_dentry->d_inode;
1878 if (!S_ISREG(llss->inode2->i_mode))
1879 GOTO(free, rc = -EINVAL);
1881 if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1882 ll_permission(llss->inode2, MAY_WRITE, NULL))
1883 GOTO(free, rc = -EPERM);
1885 if (llss->inode2->i_sb != llss->inode1->i_sb)
1886 GOTO(free, rc = -EXDEV);
1888 /* we use 2 bool because it is easier to swap than 2 bits */
1889 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1890 llss->check_dv1 = true;
1892 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1893 llss->check_dv2 = true;
1895 /* we cannot use lsl->sl_dvX directly because we may swap them */
1896 llss->dv1 = lsl->sl_dv1;
1897 llss->dv2 = lsl->sl_dv2;
1899 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1900 if (rc == 0) /* same file, done! */
1903 if (rc < 0) { /* sequentialize it */
1904 swap(llss->inode1, llss->inode2);
1906 swap(llss->dv1, llss->dv2);
1907 swap(llss->check_dv1, llss->check_dv2);
1911 if (gid != 0) { /* application asks to flush dirty cache */
1912 rc = ll_get_grouplock(llss->inode1, file1, gid);
1916 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* undo the first group lock if the second could not be taken */
1918 ll_put_grouplock(llss->inode1, file1, gid);
1923 /* to be able to restore mtime and atime after swap
1924 * we need to first save them */
1926 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1927 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1928 llss->ia1.ia_atime = llss->inode1->i_atime;
1929 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1930 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1931 llss->ia2.ia_atime = llss->inode2->i_atime;
1932 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1935 /* ultimate check, before swapping the layouts we check if
1936 * dataversion has changed (if requested) */
1937 if (llss->check_dv1) {
1938 rc = ll_data_version(llss->inode1, &dv, 0);
1941 if (dv != llss->dv1)
1942 GOTO(putgl, rc = -EAGAIN);
1945 if (llss->check_dv2) {
1946 rc = ll_data_version(llss->inode2, &dv, 0);
1949 if (dv != llss->dv2)
1950 GOTO(putgl, rc = -EAGAIN);
1953 /* struct md_op_data is used to send the swap args to the mdt
1954 * only flags is missing, so we use struct mdc_swap_layouts
1955 * through the md_op_data->op_data */
1956 /* flags from user space have to be converted before they are sent to
1957 * server, no flag is sent today, they are only used on the client */
1960 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1961 0, LUSTRE_OPC_ANY, &msl);
1962 if (IS_ERR(op_data))
1963 GOTO(free, rc = PTR_ERR(op_data));
1965 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1966 sizeof(*op_data), op_data, NULL);
1967 ll_finish_md_op_data(op_data);
1971 ll_put_grouplock(llss->inode2, file2, gid);
1972 ll_put_grouplock(llss->inode1, file1, gid);
1975 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1979 /* clear useless flags */
1980 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1981 llss->ia1.ia_valid &= ~ATTR_MTIME;
1982 llss->ia2.ia_valid &= ~ATTR_MTIME;
1985 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1986 llss->ia1.ia_valid &= ~ATTR_ATIME;
1987 llss->ia2.ia_valid &= ~ATTR_ATIME;
1990 /* update time if requested */
/* note: ia2 (saved from inode2) is applied to inode1 and vice versa,
 * because the layouts - and their times - have been swapped */
1992 if (llss->ia2.ia_valid != 0) {
1993 mutex_lock(&llss->inode1->i_mutex);
1994 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1995 mutex_unlock(&llss->inode1->i_mutex);
1998 if (llss->ia1.ia_valid != 0) {
2001 mutex_lock(&llss->inode2->i_mutex);
2002 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2003 mutex_unlock(&llss->inode2->i_mutex);
/*
 * ll_file_ioctl(): the llite file ioctl dispatcher.  Handles file-flag
 * get/set, striping (setstripe/setea/getstripe/swap-layouts), object
 * recreation, fiemap, group locks, fid/path translation, data version
 * and HSM state requests, falling through to the registered ioctl hooks
 * and the data export for anything unrecognized.
 * NOTE(review): this excerpt elides many lines (switch statement opening,
 * break/RETURN statements, allocations for hus/hss/hca, closing braces);
 * each case below is a fragment of the original.
 */
2015 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2017 struct inode *inode = file->f_dentry->d_inode;
2018 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2022 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2023 inode->i_generation, inode, cmd);
2024 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2026 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2027 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2031 case LL_IOC_GETFLAGS:
2032 /* Get the current value of the file flags */
2033 return put_user(fd->fd_flags, (int *)arg);
2034 case LL_IOC_SETFLAGS:
2035 case LL_IOC_CLRFLAGS:
2036 /* Set or clear specific file flags */
2037 /* XXX This probably needs checks to ensure the flags are
2038 * not abused, and to handle any flag side effects.
2040 if (get_user(flags, (int *) arg))
2043 if (cmd == LL_IOC_SETFLAGS) {
2044 if ((flags & LL_FILE_IGNORE_LOCK) &&
2045 !(file->f_flags & O_DIRECT)) {
2046 CERROR("%s: unable to disable locking on "
2047 "non-O_DIRECT file\n", current->comm);
2051 fd->fd_flags |= flags;
2053 fd->fd_flags &= ~flags;
2056 case LL_IOC_LOV_SETSTRIPE:
2057 RETURN(ll_lov_setstripe(inode, file, arg));
2058 case LL_IOC_LOV_SETEA:
2059 RETURN(ll_lov_setea(inode, file, arg));
2060 case LL_IOC_LOV_SWAP_LAYOUTS: {
2062 struct lustre_swap_layouts lsl;
2064 if (cfs_copy_from_user(&lsl, (char *)arg,
2065 sizeof(struct lustre_swap_layouts)))
/* both files must be open for writing to allow a layout swap */
2068 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2071 file2 = fget(lsl.sl_fd);
2076 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2077 rc = ll_swap_layouts(file, file2, &lsl);
2081 case LL_IOC_LOV_GETSTRIPE:
2082 RETURN(ll_lov_getstripe(inode, arg));
2083 case LL_IOC_RECREATE_OBJ:
2084 RETURN(ll_lov_recreate_obj(inode, arg));
2085 case LL_IOC_RECREATE_FID:
2086 RETURN(ll_lov_recreate_fid(inode, arg));
2087 case FSFILT_IOC_FIEMAP:
2088 RETURN(ll_ioctl_fiemap(inode, arg));
2089 case FSFILT_IOC_GETFLAGS:
2090 case FSFILT_IOC_SETFLAGS:
2091 RETURN(ll_iocontrol(inode, file, cmd, arg));
2092 case FSFILT_IOC_GETVERSION_OLD:
2093 case FSFILT_IOC_GETVERSION:
2094 RETURN(put_user(inode->i_generation, (int *)arg));
2095 case LL_IOC_GROUP_LOCK:
2096 RETURN(ll_get_grouplock(inode, file, arg));
2097 case LL_IOC_GROUP_UNLOCK:
2098 RETURN(ll_put_grouplock(inode, file, arg));
2099 case IOC_OBD_STATFS:
2100 RETURN(ll_obd_statfs(inode, (void *)arg));
2102 /* We need to special case any other ioctls we want to handle,
2103 * to send them to the MDS/OST as appropriate and to properly
2104 * network encode the arg field.
2105 case FSFILT_IOC_SETVERSION_OLD:
2106 case FSFILT_IOC_SETVERSION:
2108 case LL_IOC_FLUSHCTX:
2109 RETURN(ll_flush_ctx(inode));
2110 case LL_IOC_PATH2FID: {
2111 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2112 sizeof(struct lu_fid)))
2117 case OBD_IOC_FID2PATH:
2118 RETURN(ll_fid2path(inode, (void *)arg));
2119 case LL_IOC_DATA_VERSION: {
2120 struct ioc_data_version idv;
2123 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2126 rc = ll_data_version(inode, &idv.idv_version,
2127 !(idv.idv_flags & LL_DV_NOFLUSH));
2129 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2135 case LL_IOC_GET_MDTIDX: {
2138 mdtidx = ll_get_mdt_idx(inode);
2142 if (put_user((int)mdtidx, (int*)arg))
2147 case OBD_IOC_GETDTNAME:
2148 case OBD_IOC_GETMDNAME:
2149 RETURN(ll_get_obd_name(inode, cmd, arg));
2150 case LL_IOC_HSM_STATE_GET: {
2151 struct md_op_data *op_data;
2152 struct hsm_user_state *hus;
2159 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2160 LUSTRE_OPC_ANY, hus);
2161 if (IS_ERR(op_data)) {
2163 RETURN(PTR_ERR(op_data));
2166 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2169 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2172 ll_finish_md_op_data(op_data);
2176 case LL_IOC_HSM_STATE_SET: {
2177 struct md_op_data *op_data;
2178 struct hsm_state_set *hss;
2184 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2189 /* Non-root users are forbidden to set or clear flags which are
2190 * NOT defined in HSM_USER_MASK. */
2191 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2192 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2197 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2198 LUSTRE_OPC_ANY, hss);
2199 if (IS_ERR(op_data)) {
2201 RETURN(PTR_ERR(op_data));
2204 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2207 ll_finish_md_op_data(op_data);
2212 case LL_IOC_HSM_ACTION: {
2213 struct md_op_data *op_data;
2214 struct hsm_current_action *hca;
2221 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2222 LUSTRE_OPC_ANY, hca);
2223 if (IS_ERR(op_data)) {
2225 RETURN(PTR_ERR(op_data));
2228 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2231 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2234 ll_finish_md_op_data(op_data);
/* default: let registered ioctl hooks try, then the data export */
2242 ll_iocontrol_call(inode, file, cmd, arg, &err))
2245 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek_execute(): compat helper (for kernels without
 * generic_file_llseek_size) — validate @offset against sign rules and
 * @maxsize, then commit it to file->f_pos, resetting f_version on change.
 * NOTE(review): the EINVAL returns and closing brace are elided here.
 */
2251 #ifndef HAVE_FILE_LLSEEK_SIZE
2252 static inline loff_t
2253 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2255 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2257 if (offset > maxsize)
2260 if (offset != file->f_pos) {
2261 file->f_pos = offset;
2262 file->f_version = 0;
/*
 * generic_file_llseek_size() compat implementation: handle SEEK_CUR
 * (including the lockless lseek(fd, 0, SEEK_CUR) fast path), SEEK_DATA
 * and SEEK_HOLE against @eof, bounded by @maxsize, delegating the final
 * commit to llseek_execute().
 * NOTE(review): the switch/case labels and several branch bodies are
 * elided in this excerpt.
 */
2268 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2269 loff_t maxsize, loff_t eof)
2271 struct inode *inode = file->f_dentry->d_inode;
2279 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2280 * position-querying operation. Avoid rewriting the "same"
2281 * f_pos value back to the file because a concurrent read(),
2282 * write() or lseek() might have altered it
2287 * f_lock protects against read/modify/write race with other
2288 * SEEK_CURs. Note that parallel writes and reads behave
2291 mutex_lock(&inode->i_mutex);
2292 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2293 mutex_unlock(&inode->i_mutex);
2297 * In the generic case the entire file is data, so as long as
2298 * offset isn't at the end of the file then the offset is data.
2305 * There is a virtual hole at the end of the file, so as long as
2306 * offset isn't i_size or larger, return i_size.
2314 return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek(): llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA
 * it glimpses the file size from the OSTs first so i_size is current,
 * then delegates to the llseek_size helper bounded by the filesystem's
 * maximum byte offset.
 * NOTE(review): the glimpse error return and RETURN(retval) are elided.
 */
2318 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2320 struct inode *inode = file->f_dentry->d_inode;
2321 loff_t retval, eof = 0;
2324 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2325 (origin == SEEK_CUR) ? file->f_pos : 0);
2326 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2327 inode->i_ino, inode->i_generation, inode, retval, retval,
2329 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2331 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2332 retval = ll_glimpse_size(inode);
2335 eof = i_size_read(inode);
2338 retval = ll_generic_file_llseek_size(file, offset, origin,
2339 ll_file_maxbytes(inode), eof);
/*
 * ll_flush(): .flush handler (called on close()).  Collects and clears
 * async writeback errors recorded on the inode and its cl object, and
 * reports -EIO once — suppressed if the error was already delivered to
 * the application (fd_write_failed).
 * NOTE(review): rc/err declarations and the rc/err merge line are elided.
 */
2343 int ll_flush(struct file *file, fl_owner_t id)
2345 struct inode *inode = file->f_dentry->d_inode;
2346 struct ll_inode_info *lli = ll_i2info(inode);
2347 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2350 LASSERT(!S_ISDIR(inode->i_mode));
2352 /* catch async errors that were recorded back when async writeback
2353 * failed for pages in this mapping. */
2354 rc = lli->lli_async_rc;
2355 lli->lli_async_rc = 0;
2356 err = lov_read_and_clear_async_rc(lli->lli_clob);
2360 /* The application has been told write failure already.
2361 * Do not report failure again. */
2362 if (fd->fd_write_failed)
2364 return rc ? -EIO : 0;
/*
 * cl_sync_file_range(): run a CIT_FSYNC cl_io over [start, end] of
 * @inode in the given fsync @mode, optionally ignoring layout changes.
 * On success the number of pages written (fi_nr_written) is returned.
 * NOTE(review): env/io/result declarations, fi_end assignment, the
 * capa_put and RETURN are elided in this excerpt.
 */
2368 * Called to make sure a portion of file has been written out.
2369 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2371 * Return how many pages have been written.
2373 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2374 enum cl_fsync_mode mode, int ignore_layout)
2376 struct cl_env_nest nest;
2379 struct obd_capa *capa = NULL;
2380 struct cl_fsync_io *fio;
2384 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2385 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2388 env = cl_env_nested_get(&nest);
2390 RETURN(PTR_ERR(env));
2392 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2394 io = ccc_env_thread_io(env);
2395 io->ci_obj = cl_i2info(inode)->lli_clob;
2396 io->ci_ignore_layout = ignore_layout;
2398 /* initialize parameters for sync */
2399 fio = &io->u.ci_fsync;
2400 fio->fi_capa = capa;
2401 fio->fi_start = start;
2403 fio->fi_fid = ll_inode2fid(inode);
2404 fio->fi_mode = mode;
2405 fio->fi_nr_written = 0;
2407 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2408 result = cl_io_loop(env, io);
2410 result = io->ci_result;
2412 result = fio->fi_nr_written;
2413 cl_io_fini(env, io);
2414 cl_env_nested_put(&nest, env);
/*
 * ll_fsync(): fsync entry point with three kernel-API variants selected
 * by HAVE_FILE_FSYNC_{4,2}ARGS.  Flushes/waits on the page cache, folds
 * in recorded async writeback errors, syncs metadata through md_sync(),
 * and for datasync on regular files runs cl_sync_file_range(), tracking
 * success/failure in fd->fd_write_failed.
 * NOTE(review): rc/err declarations, #else/#endif pairing, the md_sync
 * trailing arguments and the final RETURN are elided in this excerpt.
 */
2422 * When dentry is provided (the 'else' case), *file->f_dentry may be
2423 * null and dentry must be used directly rather than pulled from
2424 * *file->f_dentry as is done otherwise.
2427 #ifdef HAVE_FILE_FSYNC_4ARGS
2428 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2430 struct dentry *dentry = file->f_dentry;
2431 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2432 int ll_fsync(struct file *file, int datasync)
2434 struct dentry *dentry = file->f_dentry;
2436 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2439 struct inode *inode = dentry->d_inode;
2440 struct ll_inode_info *lli = ll_i2info(inode);
2441 struct ptlrpc_request *req;
2442 struct obd_capa *oc;
2446 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2447 inode->i_generation, inode);
2448 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2450 #ifdef HAVE_FILE_FSYNC_4ARGS
2451 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2452 mutex_lock(&inode->i_mutex);
2454 /* fsync's caller has already called _fdata{sync,write}, we want
2455 * that IO to finish before calling the osc and mdc sync methods */
2456 rc = filemap_fdatawait(inode->i_mapping);
2459 /* catch async errors that were recorded back when async writeback
2460 * failed for pages in this mapping. */
2461 if (!S_ISDIR(inode->i_mode)) {
2462 err = lli->lli_async_rc;
2463 lli->lli_async_rc = 0;
2466 err = lov_read_and_clear_async_rc(lli->lli_clob);
2471 oc = ll_mdscapa_get(inode);
2472 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2478 ptlrpc_req_finished(req);
2480 if (datasync && S_ISREG(inode->i_mode)) {
2481 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2483 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2485 if (rc == 0 && err < 0)
2488 fd->fd_write_failed = true;
2490 fd->fd_write_failed = false;
2493 #ifdef HAVE_FILE_FSYNC_4ARGS
2494 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): flock()/fcntl() lock handler.  Translates the kernel
 * file_lock (FL_FLOCK whole-file or FL_POSIX byte-range) into an LDLM
 * flock enqueue on the MDS, maps lock types to LCK_PR/PW (LCK_NL means
 * unlock), then mirrors the result into the local lock tables; on local
 * failure the remote lock is rolled back with an LCK_NL enqueue.
 * NOTE(review): rc/rc2/flags declarations, several case labels and the
 * final RETURN are elided in this excerpt.
 */
2499 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2501 struct inode *inode = file->f_dentry->d_inode;
2502 struct ll_sb_info *sbi = ll_i2sbi(inode);
2503 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2504 .ei_cb_cp =ldlm_flock_completion_ast,
2505 .ei_cbdata = file_lock };
2506 struct md_op_data *op_data;
2507 struct lustre_handle lockh = {0};
2508 ldlm_policy_data_t flock = {{0}};
2514 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2515 inode->i_ino, file_lock);
2517 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2519 if (file_lock->fl_flags & FL_FLOCK) {
2520 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2521 /* flocks are whole-file locks */
2522 flock.l_flock.end = OFFSET_MAX;
2523 /* For flocks owner is determined by the local file descriptor*/
2524 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2525 } else if (file_lock->fl_flags & FL_POSIX) {
2526 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2527 flock.l_flock.start = file_lock->fl_start;
2528 flock.l_flock.end = file_lock->fl_end;
2532 flock.l_flock.pid = file_lock->fl_pid;
2534 /* Somewhat ugly workaround for svc lockd.
2535 * lockd installs custom fl_lmops->lm_compare_owner that checks
2536 * for the fl_owner to be the same (which it always is on local node
2537 * I guess between lockd processes) and then compares pid.
2538 * As such we assign pid to the owner field to make it all work,
2539 * conflict with normal locks is unlikely since pid space and
2540 * pointer space for current->files are not intersecting */
2541 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2542 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2544 switch (file_lock->fl_type) {
2546 einfo.ei_mode = LCK_PR;
2549 /* An unlock request may or may not have any relation to
2550 * existing locks so we may not be able to pass a lock handle
2551 * via a normal ldlm_lock_cancel() request. The request may even
2552 * unlock a byte range in the middle of an existing lock. In
2553 * order to process an unlock request we need all of the same
2554 * information that is given with a normal read or write record
2555 * lock request. To avoid creating another ldlm unlock (cancel)
2556 * message we'll treat a LCK_NL flock request as an unlock. */
2557 einfo.ei_mode = LCK_NL;
2560 einfo.ei_mode = LCK_PW;
2563 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2564 file_lock->fl_type);
2579 flags = LDLM_FL_BLOCK_NOWAIT;
2585 flags = LDLM_FL_TEST_LOCK;
2586 /* Save the old mode so that if the mode in the lock changes we
2587 * can decrement the appropriate reader or writer refcount. */
2588 file_lock->fl_type = einfo.ei_mode;
2591 CERROR("unknown fcntl lock command: %d\n", cmd);
2595 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2596 LUSTRE_OPC_ANY, NULL);
2597 if (IS_ERR(op_data))
2598 RETURN(PTR_ERR(op_data));
2600 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2601 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2602 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2604 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2605 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server's decision into the local kernel lock tables */
2607 if ((file_lock->fl_flags & FL_FLOCK) &&
2608 (rc == 0 || file_lock->fl_type == F_UNLCK))
2609 rc2 = flock_lock_file_wait(file, file_lock);
2610 if ((file_lock->fl_flags & FL_POSIX) &&
2611 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2612 !(flags & LDLM_FL_TEST_LOCK))
2613 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the remote lock with an unlock */
2615 if (rc2 && file_lock->fl_type != F_UNLCK) {
2616 einfo.ei_mode = LCK_NL;
2617 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2618 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2622 ll_finish_md_op_data(op_data);
/*
 * ll_file_noflock(): lock handler installed when file locking is
 * disabled for the mount; body elided in this excerpt (presumably
 * returns an error — TODO confirm against the full source).
 */
2627 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock(): probe (LDLM_FL_TEST_LOCK, no references taken)
 * whether MDS inodebits locks covering *bits are already held in
 * @l_req_mode (or any of CR/CW/PR/PW for LCK_MINMODE).  Bits found are
 * cleared from *bits one shift at a time; unmatched bits remain set.
 * NOTE(review): fid/flags/i declarations and some brace lines are elided
 * in this excerpt.
 */
2635 * test if some locks matching bits and l_req_mode are acquired
2636 * - bits can be in different locks
2637 * - if found clear the common lock bits in *bits
2638 * - the bits not found, are kept in *bits
2640 * \param bits [IN] searched lock bits [IN]
2641 * \param l_req_mode [IN] searched lock mode
2642 * \retval boolean, true iff all bits are found
2644 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2646 struct lustre_handle lockh;
2647 ldlm_policy_data_t policy;
2648 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2649 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2658 fid = &ll_i2info(inode)->lli_fid;
2659 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2660 ldlm_lockname[mode]);
2662 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* test one inodebit per iteration until all requested bits are found */
2663 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2664 policy.l_inodebits.bits = *bits & (1 << i);
2665 if (policy.l_inodebits.bits == 0)
2668 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2669 &policy, mode, &lockh)) {
2670 struct ldlm_lock *lock;
2672 lock = ldlm_handle2lock(&lockh);
2675 ~(lock->l_policy_data.l_inodebits.bits);
2676 LDLM_LOCK_PUT(lock);
2678 *bits &= ~policy.l_inodebits.bits;
/* Match an MDS inodebits lock covering \a bits on \a inode.  Unlike
 * ll_have_md_lock() above there is no LDLM_FL_TEST_LOCK here, so a
 * successful match keeps a reference and fills \a lockh; the caller must
 * drop it.  Returns the matched mode (presumably 0 on no match — the
 * return statement is not visible in this sampled listing). */
2685 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2686 struct lustre_handle *lockh, __u64 flags)
2688 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2693 fid = &ll_i2info(inode)->lli_fid;
2694 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2696 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2697 fid, LDLM_IBITS, &policy,
2698 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process the rc of a revalidation RPC: -ENOENT on a non-regular,
 * non-directory inode is treated as "already unlinked" and forgiven;
 * any other error is logged with the inode's FID. */
2702 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2704 /* Already unlinked. Just update nlink and return success */
2705 if (rc == -ENOENT) {
2707 /* This path cannot be hit for regular files unless in
2708 * case of obscure races, so no need to validate
2710 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2712 } else if (rc != 0) {
2713 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2714 ll_get_fsname(inode->i_sb, NULL, 0),
2715 PFID(ll_inode2fid(inode)), rc);
/* Revalidate the dentry/inode attributes against the MDS.  Two paths:
 * with OBD_CONNECT_ATTRFID, an intent getattr-by-FID (md_intent_lock);
 * otherwise, if no matching ibits lock is cached, a plain md_getattr RPC
 * followed by ll_prep_inode().  Many lines are missing from this sampled
 * listing (error paths, RETURNs), so do not infer the full control flow. */
2721 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2724 struct inode *inode = dentry->d_inode;
2725 struct ptlrpc_request *req = NULL;
2726 struct obd_export *exp;
2730 LASSERT(inode != NULL);
2732 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2733 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2735 exp = ll_i2mdexp(inode);
2737 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2738 * But under CMD case, it caused some lock issues, should be fixed
2739 * with new CMD ibits lock. See bug 12718 */
2740 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2741 struct lookup_intent oit = { .it_op = IT_GETATTR };
2742 struct md_op_data *op_data;
/* A pure LOOKUP revalidation needs only an IT_LOOKUP intent. */
2744 if (ibits == MDS_INODELOCK_LOOKUP)
2745 oit.it_op = IT_LOOKUP;
2747 /* Call getattr by fid, so do not provide name at all. */
2748 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2749 dentry->d_inode, NULL, 0, 0,
2750 LUSTRE_OPC_ANY, NULL);
2751 if (IS_ERR(op_data))
2752 RETURN(PTR_ERR(op_data));
/* M_CHECK_STALE asks the MDS path to detect a stale inode. */
2754 oit.it_create_mode |= M_CHECK_STALE;
2755 rc = md_intent_lock(exp, op_data, NULL, 0,
2756 /* we are not interested in name
2759 ll_md_blocking_ast, 0);
2760 ll_finish_md_op_data(op_data);
2761 oit.it_create_mode &= ~M_CHECK_STALE;
2763 rc = ll_inode_revalidate_fini(inode, rc);
2767 rc = ll_revalidate_it_finish(req, &oit, dentry);
2769 ll_intent_release(&oit);
2773 /* Unlinked? Unhash dentry, so it is not picked up later by
2774 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2775 here to preserve get_cwd functionality on 2.6.
2777 if (!dentry->d_inode->i_nlink)
2778 d_lustre_invalidate(dentry, 0);
2780 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue a getattr RPC when no cached ibits
 * lock already covers the requested bits (any mode). */
2781 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2782 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2783 obd_valid valid = OBD_MD_FLGETATTR;
2784 struct md_op_data *op_data;
/* Regular files also need striping EA data sized via max mdsize. */
2787 if (S_ISREG(inode->i_mode)) {
2788 rc = ll_get_max_mdsize(sbi, &ealen);
2791 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2794 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2795 0, ealen, LUSTRE_OPC_ANY,
2797 if (IS_ERR(op_data))
2798 RETURN(PTR_ERR(op_data));
2800 op_data->op_valid = valid;
2801 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2802 * capa for this inode. Because we only keep capas of dirs
2804 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2805 ll_finish_md_op_data(op_data);
2807 rc = ll_inode_revalidate_fini(inode, rc);
2811 rc = ll_prep_inode(&inode, req, NULL, NULL);
2814 ptlrpc_req_finished(req);
/* Revalidate attributes, then refresh size: for regular files glimpse the
 * OSTs; for everything else copy the timestamps cached in the LVB. */
2818 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2821 struct inode *inode = dentry->d_inode;
2825 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2829 /* if object isn't regular file, don't validate size */
2830 if (!S_ISREG(inode->i_mode)) {
2831 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2832 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2833 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
/* Regular file: ask the OSTs for the authoritative size. */
2835 rc = ll_glimpse_size(inode);
/* Fill \a stat from the (just revalidated) inode.  Revalidates both
 * UPDATE and LOOKUP ibits first, bumps the getattr stats counter, then
 * copies the generic attributes.  In 32-bit-API mode the ino is derived
 * from the FID rather than taken from the inode. */
2840 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2841 struct lookup_intent *it, struct kstat *stat)
2843 struct inode *inode = de->d_inode;
2844 struct ll_sb_info *sbi = ll_i2sbi(inode);
2845 struct ll_inode_info *lli = ll_i2info(inode);
2848 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2849 MDS_INODELOCK_LOOKUP);
2850 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2855 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland on a 64-bit ino namespace: squash the FID to 32 bits. */
2856 if (ll_need_32bit_api(sbi))
2857 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2859 stat->ino = inode->i_ino;
2860 stat->mode = inode->i_mode;
2861 stat->nlink = inode->i_nlink;
2862 stat->uid = inode->i_uid;
2863 stat->gid = inode->i_gid;
2864 stat->rdev = inode->i_rdev;
2865 stat->atime = inode->i_atime;
2866 stat->mtime = inode->i_mtime;
2867 stat->ctime = inode->i_ctime;
2868 stat->blksize = 1 << inode->i_blkbits;
2870 stat->size = i_size_read(inode);
2871 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2875 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2877 struct lookup_intent it = { .it_op = IT_GETATTR };
2879 return ll_getattr_it(mnt, de, &it, stat);
2882 #ifdef HAVE_LINUX_FIEMAP_H
/* VFS ->fiemap handler: marshal fieinfo into a ll_user_fiemap buffer
 * (header + fi_extents_max extents), run ll_do_fiemap(), then copy the
 * mapped extents back out. */
2883 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2884 __u64 start, __u64 len)
2888 struct ll_user_fiemap *fiemap;
2889 unsigned int extent_count = fieinfo->fi_extents_max;
2891 num_bytes = sizeof(*fiemap) + (extent_count *
2892 sizeof(struct ll_fiemap_extent));
2893 OBD_ALLOC_LARGE(fiemap, num_bytes);
2898 fiemap->fm_flags = fieinfo->fi_flags;
2899 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2900 fiemap->fm_start = start;
2901 fiemap->fm_length = len;
/* NOTE(review): this copies exactly one extent from fi_extents_start
 * unconditionally — even when extent_count == 0, in which case
 * fi_extents_start may not be a valid source.  Confirm against the
 * fiemap ioctl contract and guard with `if (extent_count)` if so. */
2902 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2903 sizeof(struct ll_fiemap_extent));
2905 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2907 fieinfo->fi_flags = fiemap->fm_flags;
2908 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2909 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2910 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2912 OBD_FREE_LARGE(fiemap, num_bytes);
/* Return a referenced copy of the cached POSIX ACL of the given type.
 * lli_lock protects lli_posix_acl against concurrent update. */
2917 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2919 struct ll_inode_info *lli = ll_i2info(inode);
2920 struct posix_acl *acl = NULL;
2923 spin_lock(&lli->lli_lock);
2924 /* VFS' acl_permission_check->check_acl will release the refcount */
2925 acl = posix_acl_dup(lli->lli_posix_acl);
2926 spin_unlock(&lli->lli_lock);
2931 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/* check_acl callback for ll_generic_permission(): evaluate the cached
 * POSIX ACL against \a mask.  Signature varies with kernel version
 * (3-arg with flags vs. 2-arg); RCU walks bail out early since
 * ll_get_acl() takes a spinlock.  Only built when the kernel's
 * generic_permission() still takes a check_acl callback. */
2933 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2934 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2936 ll_check_acl(struct inode *inode, int mask)
2939 # ifdef CONFIG_FS_POSIX_ACL
2940 struct posix_acl *acl;
2944 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2945 if (flags & IPERM_FLAG_RCU)
2948 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2953 rc = posix_acl_permission(inode, acl, mask);
2954 posix_acl_release(acl);
2957 # else /* !CONFIG_FS_POSIX_ACL */
2959 # endif /* CONFIG_FS_POSIX_ACL */
2961 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2963 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* VFS ->permission entry point (signature varies by kernel version).
 * Revalidates the root inode on first touch, defers remote-client
 * permission checks to the MDS, and otherwise falls through to
 * generic permission checking with the ACL callback. */
2964 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2966 # ifdef HAVE_INODE_PERMISION_2ARGS
2967 int ll_inode_permission(struct inode *inode, int mask)
2969 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU-walk cannot block; bail out so the VFS retries in ref-walk mode. */
2976 #ifdef MAY_NOT_BLOCK
2977 if (mask & MAY_NOT_BLOCK)
2979 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2980 if (flags & IPERM_FLAG_RCU)
2984 /* as root inode are NOT getting validated in lookup operation,
2985 * need to do it before permission check. */
2987 if (inode == inode->i_sb->s_root->d_inode) {
2988 struct lookup_intent it = { .it_op = IT_LOOKUP };
2990 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2991 MDS_INODELOCK_LOOKUP);
2996 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2997 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote clients cannot map ids locally; ask the MDS to decide. */
2999 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3000 return lustre_check_remote_perm(inode, mask);
3002 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3003 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/* Map the vectored read/write fops member names and implementations to
 * whatever this kernel provides: old ->readv/->writev versus newer
 * ->aio_read/->aio_write.  Used by the file_operations tables below. */
3008 #ifdef HAVE_FILE_READV
3009 #define READ_METHOD readv
3010 #define READ_FUNCTION ll_file_readv
3011 #define WRITE_METHOD writev
3012 #define WRITE_FUNCTION ll_file_writev
3014 #define READ_METHOD aio_read
3015 #define READ_FUNCTION ll_file_aio_read
3016 #define WRITE_METHOD aio_write
3017 #define WRITE_FUNCTION ll_file_aio_write
3020 /* -o localflock - only provides locally consistent flock locks */
/* Default fops table: no ->flock/->lock members, so flock falls back to
 * the VFS local implementation (hence "localflock"). */
3021 struct file_operations ll_file_operations = {
3022 .read = ll_file_read,
3023 .READ_METHOD = READ_FUNCTION,
3024 .write = ll_file_write,
3025 .WRITE_METHOD = WRITE_FUNCTION,
3026 .unlocked_ioctl = ll_file_ioctl,
3027 .open = ll_file_open,
3028 .release = ll_file_release,
3029 .mmap = ll_file_mmap,
3030 .llseek = ll_file_seek,
3031 #ifdef HAVE_KERNEL_SENDFILE
3032 .sendfile = ll_file_sendfile,
3034 #ifdef HAVE_KERNEL_SPLICE_READ
3035 .splice_read = ll_file_splice_read,
/* fops table for cluster-coherent locking: identical to
 * ll_file_operations except ->flock and ->lock route through
 * ll_file_flock, which enqueues DLM flock locks on the MDS. */
3041 struct file_operations ll_file_operations_flock = {
3042 .read = ll_file_read,
3043 .READ_METHOD = READ_FUNCTION,
3044 .write = ll_file_write,
3045 .WRITE_METHOD = WRITE_FUNCTION,
3046 .unlocked_ioctl = ll_file_ioctl,
3047 .open = ll_file_open,
3048 .release = ll_file_release,
3049 .mmap = ll_file_mmap,
3050 .llseek = ll_file_seek,
3051 #ifdef HAVE_KERNEL_SENDFILE
3052 .sendfile = ll_file_sendfile,
3054 #ifdef HAVE_KERNEL_SPLICE_READ
3055 .splice_read = ll_file_splice_read,
3059 .flock = ll_file_flock,
3060 .lock = ll_file_flock
3063 /* These are for -o noflock - to return ENOSYS on flock calls */
3064 struct file_operations ll_file_operations_noflock = {
3065 .read = ll_file_read,
3066 .READ_METHOD = READ_FUNCTION,
3067 .write = ll_file_write,
3068 .WRITE_METHOD = WRITE_FUNCTION,
3069 .unlocked_ioctl = ll_file_ioctl,
3070 .open = ll_file_open,
3071 .release = ll_file_release,
3072 .mmap = ll_file_mmap,
3073 .llseek = ll_file_seek,
3074 #ifdef HAVE_KERNEL_SENDFILE
3075 .sendfile = ll_file_sendfile,
3077 #ifdef HAVE_KERNEL_SPLICE_READ
3078 .splice_read = ll_file_splice_read,
/* Both lock entry points refuse service via ll_file_noflock. */
3082 .flock = ll_file_noflock,
3083 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, permission, xattr and
 * (when available) fiemap/get_acl handlers defined above in this file. */
3086 struct inode_operations ll_file_inode_operations = {
3087 .setattr = ll_setattr,
3088 .getattr = ll_getattr,
3089 .permission = ll_inode_permission,
3090 .setxattr = ll_setxattr,
3091 .getxattr = ll_getxattr,
3092 .listxattr = ll_listxattr,
3093 .removexattr = ll_removexattr,
3094 #ifdef HAVE_LINUX_FIEMAP_H
3095 .fiemap = ll_fiemap,
3097 #ifdef HAVE_IOP_GET_ACL
3098 .get_acl = ll_get_acl,
3102 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries guarded by an rwsem.  llioc_data ends in a
 * flexible iocd_cmd[] array holding iocd_count ioctl numbers. */
3103 static struct llioc_ctl_data {
3104 struct rw_semaphore ioc_sem;
3105 cfs_list_t ioc_head;
3107 __RWSEM_INITIALIZER(llioc.ioc_sem),
3108 CFS_LIST_HEAD_INIT(llioc.ioc_head)
3113 cfs_list_t iocd_list;
3114 unsigned int iocd_size;
3115 llioc_callback_t iocd_cb;
3116 unsigned int iocd_count;
3117 unsigned int iocd_cmd[0];
/* Register a callback for \a count dynamic ioctl numbers.  Returns an
 * opaque magic cookie (the allocated llioc_data — the RETURN lines are
 * not visible in this sampled listing) used later for unregistration. */
3120 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3123 struct llioc_data *in_data = NULL;
3126 if (cb == NULL || cmd == NULL ||
3127 count > LLIOC_MAX_CMD || count < 0)
3130 size = sizeof(*in_data) + count * sizeof(unsigned int);
3131 OBD_ALLOC(in_data, size);
3132 if (in_data == NULL)
/* NOTE(review): if OBD_ALLOC already zeroes the buffer this memset is
 * redundant — and it only clears sizeof(*in_data), not the trailing
 * iocd_cmd[] part of `size`.  Harmless since every field is assigned
 * below, but worth confirming against the OBD_ALLOC definition. */
3135 memset(in_data, 0, sizeof(*in_data));
3136 in_data->iocd_size = size;
3137 in_data->iocd_cb = cb;
3138 in_data->iocd_count = count;
3139 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3141 down_write(&llioc.ioc_sem);
3142 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3143 up_write(&llioc.ioc_sem);
/* Unregister a dynamic ioctl block by the magic cookie returned from
 * ll_iocontrol_register(); warns if the cookie is not found.  The list
 * entry is unlinked under the write lock, then freed after unlock. */
3148 void ll_iocontrol_unregister(void *magic)
3150 struct llioc_data *tmp;
3155 down_write(&llioc.ioc_sem);
3156 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3158 unsigned int size = tmp->iocd_size;
3160 cfs_list_del(&tmp->iocd_list);
3161 up_write(&llioc.ioc_sem);
3163 OBD_FREE(tmp, size);
3167 up_write(&llioc.ioc_sem);
3169 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3172 EXPORT_SYMBOL(ll_iocontrol_register);
3173 EXPORT_SYMBOL(ll_iocontrol_unregister);
/* Dispatch \a cmd to the registered dynamic ioctl handlers under the
 * read lock.  The first handler owning the cmd is invoked; its verdict
 * (LLIOC_STOP vs. LLIOC_CONT) controls whether the scan ends.  The
 * handler's rc is passed back through \a rcp. */
3175 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3176 unsigned int cmd, unsigned long arg, int *rcp)
3178 enum llioc_iter ret = LLIOC_CONT;
3179 struct llioc_data *data;
3180 int rc = -EINVAL, i;
3182 down_read(&llioc.ioc_sem);
3183 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3184 for (i = 0; i < data->iocd_count; i++) {
3185 if (cmd != data->iocd_cmd[i])
3188 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3192 if (ret == LLIOC_STOP)
3195 up_read(&llioc.ioc_sem);
/* Push a layout configuration into the cl_object layer via cl_conf_set()
 * inside a nested cl environment.  For OBJECT_CONF_SET the layout lock
 * is only allowed to match after the layout is applied, so stale layouts
 * can never be observed. */
3202 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3204 struct ll_inode_info *lli = ll_i2info(inode);
3205 struct cl_env_nest nest;
/* No cl object yet: nothing to configure. */
3210 if (lli->lli_clob == NULL)
3213 env = cl_env_nested_get(&nest);
3215 RETURN(PTR_ERR(env));
3217 result = cl_conf_set(env, lli->lli_clob, conf);
3218 cl_env_nested_put(&nest, env);
3220 if (conf->coc_opc == OBJECT_CONF_SET) {
3221 struct ldlm_lock *lock = conf->coc_lock;
3223 LASSERT(lock != NULL);
3224 LASSERT(ldlm_has_layout(lock));
3226 /* it can only be allowed to match after layout is
3227 * applied to inode otherwise false layout would be
3228 * seen. Applying layout should happen before dropping
3229 * the intent lock. */
3230 ldlm_lock_allow_match(lock);
3236 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3237 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3240 struct ll_sb_info *sbi = ll_i2sbi(inode);
3241 struct obd_capa *oc;
3242 struct ptlrpc_request *req;
3243 struct mdt_body *body;
/* LVB already populated (lock granted immediately): nothing to fetch. */
3250 if (lock->l_lvb_data != NULL)
3253 /* if layout lock was granted right away, the layout is returned
3254 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3255 * blocked and then granted via completion ast, we have to fetch
3256 * layout here. Please note that we can't use the LVB buffer in
3257 * completion AST because it doesn't have a large enough buffer */
3258 oc = ll_mdscapa_get(inode);
3259 rc = ll_get_max_mdsize(sbi, &lmmsize);
3261 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3262 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3268 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Server claimed more EA data than the negotiated max: protocol error. */
3269 if (body == NULL || body->eadatasize > lmmsize)
3270 GOTO(out, rc = -EPROTO);
3272 lmmsize = body->eadatasize;
3273 if (lmmsize == 0) /* empty layout */
3276 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3278 GOTO(out, rc = -EFAULT);
3280 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3281 if (lvbdata == NULL)
3282 GOTO(out, rc = -ENOMEM);
3284 memcpy(lvbdata, lmm, lmmsize);
/* Install the fetched layout into the lock's LVB under the res lock;
 * if another thread raced us and already installed one, free ours. */
3285 lock_res_and_lock(lock);
3286 if (lock->l_lvb_data == NULL) {
3287 lock->l_lvb_data = lvbdata;
3288 lock->l_lvb_len = lmmsize;
3291 unlock_res_and_lock(lock);
3293 if (lvbdata != NULL)
3294 OBD_FREE_LARGE(lvbdata, lmmsize);
3298 ptlrpc_req_finished(req);
3303 * Apply the layout to the inode. Layout lock is held and will be released
3306 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3307 struct inode *inode, __u32 *gen, bool reconf)
3309 struct ll_inode_info *lli = ll_i2info(inode);
3310 struct ll_sb_info *sbi = ll_i2sbi(inode);
3311 struct ldlm_lock *lock;
3312 struct lustre_md md = { NULL };
3313 struct cl_object_conf conf;
3316 bool wait_layout = false;
3319 LASSERT(lustre_handle_is_used(lockh));
3321 lock = ldlm_handle2lock(lockh);
3322 LASSERT(lock != NULL);
3323 LASSERT(ldlm_has_layout(lock));
3325 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3326 inode, PFID(&lli->lli_fid), reconf);
3328 /* in case this is a caching lock and reinstate with new inode */
3329 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3331 lock_res_and_lock(lock);
3332 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3333 unlock_res_and_lock(lock);
3334 /* checking lvb_ready is racy but this is okay. The worst case is
3335 * that multi processes may configure the file on the same time. */
3336 if (lvb_ready || !reconf) {
3339 /* layout_gen must be valid if layout lock is not
3340 * cancelled and stripe has already set */
3341 *gen = lli->lli_layout_gen;
/* Ensure the lock's LVB actually carries the layout before unpacking. */
3347 rc = ll_layout_fetch(inode, lock);
3351 /* for layout lock, lmm is returned in lock's lvb.
3352 * lvb_data is immutable if the lock is held so it's safe to access it
3353 * without res lock. See the description in ldlm_lock_decref_internal()
3354 * for the condition to free lvb_data of layout lock */
3355 if (lock->l_lvb_data != NULL) {
3356 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3357 lock->l_lvb_data, lock->l_lvb_len);
3359 *gen = LL_LAYOUT_GEN_EMPTY;
3361 *gen = md.lsm->lsm_layout_gen;
3364 CERROR("%s: file "DFID" unpackmd error: %d\n",
3365 ll_get_fsname(inode->i_sb, NULL, 0),
3366 PFID(&lli->lli_fid), rc);
3372 /* set layout to file. Unlikely this will fail as old layout was
3373 * surely eliminated */
3374 memset(&conf, 0, sizeof conf);
3375 conf.coc_opc = OBJECT_CONF_SET;
3376 conf.coc_inode = inode;
3377 conf.coc_lock = lock;
3378 conf.u.coc_md = &md;
3379 rc = ll_layout_conf(inode, &conf);
3382 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3384 /* refresh layout failed, need to wait */
3385 wait_layout = rc == -EBUSY;
/* Drop our lock reference before any waiting below. */
3389 LDLM_LOCK_PUT(lock);
3390 ldlm_lock_decref(lockh, mode);
3392 /* wait for IO to complete if it's still being used. */
3394 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3395 ll_get_fsname(inode->i_sb, NULL, 0),
3396 inode, PFID(&lli->lli_fid));
3398 memset(&conf, 0, sizeof conf);
3399 conf.coc_opc = OBJECT_CONF_WAIT;
3400 conf.coc_inode = inode;
3401 rc = ll_layout_conf(inode, &conf);
3405 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3406 PFID(&lli->lli_fid), rc);
3412 * This function checks if there exists a LAYOUT lock on the client side,
3413 * or enqueues it if it doesn't have one in cache.
3415 * This function will not hold layout lock so it may be revoked any time after
3416 * this function returns. Any operations depend on layout should be redone
3419 * This function should be called before lov_io_init() to get an uptodate
3420 * layout version, the caller should save the version number and after IO
3421 * is finished, this function should be called again to verify that layout
3422 * is not changed during IO time.
3424 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3426 struct ll_inode_info *lli = ll_i2info(inode);
3427 struct ll_sb_info *sbi = ll_i2sbi(inode);
3428 struct md_op_data *op_data;
3429 struct lookup_intent it;
3430 struct lustre_handle lockh;
3432 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3434 .ei_cb_bl = ll_md_blocking_ast,
3435 .ei_cb_cp = ldlm_completion_ast,
3436 .ei_cbdata = NULL };
3440 *gen = lli->lli_layout_gen;
/* Layout lock support not negotiated with the server: keep cached gen. */
3441 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3445 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3446 LASSERT(S_ISREG(inode->i_mode));
3448 /* mostly layout lock is caching on the local side, so try to match
3449 * it before grabbing layout lock mutex. */
3450 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3451 if (mode != 0) { /* hit cached lock */
3452 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3456 /* better hold lli_layout_mutex to try again otherwise
3457 * it will have starvation problem. */
3460 /* take layout lock mutex to enqueue layout lock exclusively. */
3461 mutex_lock(&lli->lli_layout_mutex);
3464 /* try again. Maybe somebody else has done this. */
3465 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3466 if (mode != 0) { /* hit cached lock */
/* reconf=true here: the layout may have changed since the miss. */
3467 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3471 mutex_unlock(&lli->lli_layout_mutex);
3475 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3476 0, 0, LUSTRE_OPC_ANY, NULL);
3477 if (IS_ERR(op_data)) {
3478 mutex_unlock(&lli->lli_layout_mutex);
3479 RETURN(PTR_ERR(op_data));
3482 /* have to enqueue one */
3483 memset(&it, 0, sizeof(it));
3484 it.it_op = IT_LAYOUT;
3485 lockh.cookie = 0ULL;
3487 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3488 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3489 PFID(&lli->lli_fid));
3491 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* Release the intent's request/lock bookkeeping; we keep only lockh. */
3493 if (it.d.lustre.it_data != NULL)
3494 ptlrpc_req_finished(it.d.lustre.it_data);
3495 it.d.lustre.it_data = NULL;
3497 ll_finish_md_op_data(op_data);
3499 mode = it.d.lustre.it_lock_mode;
3500 it.d.lustre.it_lock_mode = 0;
3501 ll_intent_drop_lock(&it);
3504 /* set lock data in case this is a new lock */
3505 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3506 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3510 mutex_unlock(&lli->lli_layout_mutex);