4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate and minimally initialize a per-open ll_file_data structure
 * from the ll_file_data_slab cache.
 * NOTE(review): the allocation-failure branch and the return statement
 * are elided from this excerpt — confirm against the full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
/* __GFP_IO: allocation may perform I/O but must not recurse into the FS */
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
/* start each open with a clean write-error state */
61 fd->fd_write_failed = false;
/* Release a per-open ll_file_data back to its slab cache. */
66 static void ll_file_data_put(struct ll_file_data *fd)
69 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * I/O epoch, open handle and MDS capability into @op_data so they can be
 * sent to the MDS (e.g. on close).
 */
72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
73 struct lustre_handle *fh)
75 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
76 op_data->op_attr.ia_mode = inode->i_mode;
77 op_data->op_attr.ia_atime = inode->i_atime;
78 op_data->op_attr.ia_mtime = inode->i_mtime;
79 op_data->op_attr.ia_ctime = inode->i_ctime;
80 op_data->op_attr.ia_size = i_size_read(inode);
81 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; translate VFS flags */
82 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
83 ll_inode_to_ext_flags(inode->i_flags);
84 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
86 op_data->op_handle = *fh;
87 op_data->op_capa1 = ll_mdscapa_get(inode);
/* tell the MDS the data was modified so it can update SOM attributes */
89 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
90 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.
 * NOTE(review): several lines of this function (braces, early-out for
 * read-only opens) are elided in this excerpt.
 */
97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
98 struct obd_client_handle *och)
/* always send mode and timestamps back to the MDS on close */
102 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
106 if (!(och->och_flags & FMODE_WRITE))
/* without Size-on-MDS support (or for non-regular files) the client
 * is authoritative for size/blocks, so include them in the close */
109 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
110 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
112 ll_ioepoch_close(inode, op_data, &och, 0);
115 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
116 ll_prep_md_op_data(op_data, inode, NULL, NULL,
117 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send a CLOSE rpc to the MDS for the given open handle, performing the
 * Size-on-MDS update when the MDS requests it, clearing the
 * LLIF_DATA_MODIFIED flag on success, and releasing replay data.
 * NOTE(review): many lines (error branches, GOTO targets, RETURN) are
 * elided in this excerpt; read the full source before modifying.
 */
121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
123 struct obd_client_handle *och)
125 struct obd_export *exp = ll_i2mdexp(inode);
126 struct md_op_data *op_data;
127 struct ptlrpc_request *req = NULL;
128 struct obd_device *obd = class_exp2obd(exp);
135 * XXX: in case of LMV, is this correct to access
138 CERROR("Invalid MDC connection handle "LPX64"\n",
139 ll_i2mdexp(inode)->exp_handle.h_cookie);
143 OBD_ALLOC_PTR(op_data);
145 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
147 ll_prepare_close(inode, op_data, och);
/* remember whether this close also closes the I/O epoch */
148 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
149 rc = md_close(md_exp, op_data, och->och_mod, &req);
151 /* This close must have the epoch closed. */
152 LASSERT(epoch_close);
153 /* MDS has instructed us to obtain Size-on-MDS attribute from
154 * OSTs and send setattr to back to MDS. */
155 rc = ll_som_update(inode, op_data);
157 CERROR("inode %lu mdc Size-on-MDS update failed: "
158 "rc = %d\n", inode->i_ino, rc);
162 CERROR("inode %lu mdc close failed: rc = %d\n",
166 /* DATA_MODIFIED flag was successfully sent on close, cancel data
167 * modification flag. */
168 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
169 struct ll_inode_info *lli = ll_i2info(inode);
/* lli_lock protects lli_flags against concurrent updaters */
171 spin_lock(&lli->lli_lock);
172 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
173 spin_unlock(&lli->lli_lock);
176 ll_finish_md_op_data(op_data);
179 rc = ll_objects_destroy(req, inode);
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* with SOM, a write handle whose epoch is still open must queue a
 * DONE_WRITING so the MDS eventually learns the final size */
188 if (exp_connect_som(exp) && !epoch_close &&
189 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
190 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
192 md_clear_open_replay_data(md_exp, och);
193 /* Free @och if it is not waiting for DONE_WRITING. */
194 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
197 if (req) /* This is close request */
198 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle of the given kind (write/exec/read) if no
 * other local users remain, sending the actual CLOSE to the MDS.
 * NOTE(review): the handle-detach logic between the usecount check and
 * the final close is elided in this excerpt.
 */
202 int ll_md_real_close(struct inode *inode, int flags)
204 struct ll_inode_info *lli = ll_i2info(inode);
205 struct obd_client_handle **och_p;
206 struct obd_client_handle *och;
/* pick the per-inode handle slot matching the open mode */
211 if (flags & FMODE_WRITE) {
212 och_p = &lli->lli_mds_write_och;
213 och_usecount = &lli->lli_open_fd_write_count;
214 } else if (flags & FMODE_EXEC) {
215 och_p = &lli->lli_mds_exec_och;
216 och_usecount = &lli->lli_open_fd_exec_count;
218 LASSERT(flags & FMODE_READ);
219 och_p = &lli->lli_mds_read_och;
220 och_usecount = &lli->lli_open_fd_read_count;
223 mutex_lock(&lli->lli_och_mutex);
224 if (*och_usecount) { /* There are still users of this handle, so
226 mutex_unlock(&lli->lli_och_mutex);
231 mutex_unlock(&lli->lli_och_mutex);
233 if (och) { /* There might be a race and somebody have freed this och
235 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the local
 * open count for this open mode, and skip the MDS CLOSE rpc when a
 * cached OPEN ldlm lock still covers the file; finally free the
 * ll_file_data and close the capability.
 * NOTE(review): several lines (lockmode setup, md_lock_match arguments,
 * negative-dentry else-branch bracing) are elided in this excerpt.
 */
242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
245 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
246 struct ll_inode_info *lli = ll_i2info(inode);
250 /* clear group lock, if present */
251 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
252 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
254 /* Let's see if we have good enough OPEN lock on the file and if
255 we can skip talking to MDS */
256 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: probe for a matching granted lock without taking a ref */
258 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
259 struct lustre_handle lockh;
260 struct inode *inode = file->f_dentry->d_inode;
261 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
263 mutex_lock(&lli->lli_och_mutex);
264 if (fd->fd_omode & FMODE_WRITE) {
266 LASSERT(lli->lli_open_fd_write_count);
267 lli->lli_open_fd_write_count--;
268 } else if (fd->fd_omode & FMODE_EXEC) {
270 LASSERT(lli->lli_open_fd_exec_count);
271 lli->lli_open_fd_exec_count--;
274 LASSERT(lli->lli_open_fd_read_count);
275 lli->lli_open_fd_read_count--;
277 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN lock -> must really close the MDS handle */
279 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
280 LDLM_IBITS, &policy, lockmode,
282 rc = ll_md_real_close(file->f_dentry->d_inode,
286 CERROR("Releasing a file %p with negative dentry %p. Name %s",
287 file, file->f_dentry, file->f_dentry->d_name.name);
290 LUSTRE_FPRIVATE(file) = NULL;
291 ll_file_data_put(fd);
292 ll_capa_close(inode);
297 /* While this returns an error code, fput() the caller does not, so we need
298 * to make every effort to clean up all of our state here. Also, applications
299 * rarely check close errors and even if an error is returned they will not
300 * re-try the close call.
*/
302 int ll_file_release(struct inode *inode, struct file *file)
304 struct ll_file_data *fd;
305 struct ll_sb_info *sbi = ll_i2sbi(inode);
306 struct ll_inode_info *lli = ll_i2info(inode);
310 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
311 inode->i_generation, inode);
313 #ifdef CONFIG_FS_POSIX_ACL
/* remote-client ACL bookkeeping is tracked on the root inode only */
314 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
315 inode == inode->i_sb->s_root->d_inode) {
316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
319 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
320 fd->fd_flags &= ~LL_FILE_RMTACL;
321 rct_del(&sbi->ll_rct, cfs_curproc_pid());
322 et_search_free(&sbi->ll_et, cfs_curproc_pid());
/* don't count releases of the root dentry in the stats */
327 if (inode->i_sb->s_root != file->f_dentry)
328 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
329 fd = LUSTRE_FPRIVATE(file);
332 /* The last ref on @file, maybe not the owner pid of statahead.
333 * Different processes can open the same dir, "ll_opendir_key" means:
334 * it is me that should stop the statahead thread. */
335 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
336 lli->lli_opendir_pid != 0)
337 ll_stop_statahead(inode, lli->lli_opendir_key);
/* the root has no MDS open handle to close; just free fd */
339 if (inode->i_sb->s_root == file->f_dentry) {
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
/* propagate any deferred async write error into this close */
345 if (!S_ISDIR(inode->i_mode)) {
346 lov_read_and_clear_async_rc(lli->lli_clob);
347 lli->lli_async_rc = 0;
350 rc = ll_md_close(sbi->ll_md_exp, inode, file);
352 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
353 libcfs_debug_dumplog();
/*
 * Perform an intent-based OPEN against the MDS for @file (used by NFS
 * export and by setstripe), optionally requesting an OPEN ldlm lock,
 * then update the local inode from the reply.
 * NOTE(review): error-path bracing and several GOTO targets are elided
 * in this excerpt.
 */
358 static int ll_intent_file_open(struct file *file, void *lmm,
359 int lmmsize, struct lookup_intent *itp)
361 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
362 struct dentry *parent = file->f_dentry->d_parent;
363 const char *name = file->f_dentry->d_name.name;
364 const int len = file->f_dentry->d_name.len;
365 struct md_op_data *op_data;
366 struct ptlrpc_request *req;
367 __u32 opc = LUSTRE_OPC_ANY;
374 /* Usually we come here only for NFSD, and we want open lock.
375 But we can also get here with pre 2.6.15 patchless kernels, and in
376 that case that lock is also ok */
377 /* We can also get here if there was cached open handle in revalidate_it
378 * but it disappeared while we were getting from there to ll_file_open.
379 * But this means this file was closed and immediately opened which
380 * makes a good candidate for using OPEN lock */
381 /* If lmmsize & lmm are not 0, we are just setting stripe info
382 * parameters. No need for the open lock */
383 if (lmm == NULL && lmmsize == 0) {
384 itp->it_flags |= MDS_OPEN_LOCK;
385 if (itp->it_flags & FMODE_WRITE)
386 opc = LUSTRE_OPC_CREATE;
389 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
390 file->f_dentry->d_inode, name, len,
393 RETURN(PTR_ERR(op_data));
/* the file already exists: open it by FID, not by name */
395 itp->it_flags |= MDS_OPEN_BY_FID;
396 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
397 0 /*unused */, &req, ll_md_blocking_ast, 0);
398 ll_finish_md_op_data(op_data);
400 /* reason for keep own exit path - don't flood log
401 * with messages with -ESTALE errors.
*/
403 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
404 it_open_error(DISP_OPEN_OPEN, itp))
406 ll_release_openhandle(file->f_dentry, itp);
410 if (it_disposition(itp, DISP_LOOKUP_NEG))
411 GOTO(out, rc = -ENOENT);
413 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
414 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
415 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* refresh the inode from the OPEN reply and attach lock data */
419 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
420 if (!rc && itp->d.lustre.it_lock_mode)
421 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
425 ptlrpc_req_finished(itp->d.lustre.it_data);
426 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
427 ll_intent_drop_lock(itp);
/*
433 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
434 * not believe attributes if a few ioepoch holders exist. Attributes for
435 * previous ioepoch if new one is opened are also skipped by MDS.
 */
437 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* only record a real, changed epoch; 0 means "no epoch supplied" */
439 if (ioepoch && lli->lli_ioepoch != ioepoch) {
440 lli->lli_ioepoch = ioepoch;
441 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
442 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT_BODY of the OPEN reply
 * carried by @it: server file handle, FID, open flags and I/O epoch,
 * then register the open for replay. Returns md_set_open_replay_data()'s
 * result.
 */
446 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
447 struct lookup_intent *it, struct obd_client_handle *och)
449 struct ptlrpc_request *req = it->d.lustre.it_data;
450 struct mdt_body *body;
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 LASSERT(body != NULL); /* reply already checked out */
457 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
458 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
459 och->och_fid = lli->lli_fid;
460 och->och_flags = it->it_flags;
461 ll_ioepoch_open(lli, body->ioepoch);
463 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a local open: fill @och from the intent reply (when @och is
 * given), then attach @fd as the file's private data and initialize
 * readahead state.
 * NOTE(review): the och!=NULL guard and error handling around
 * ll_och_fill() are elided in this excerpt.
 */
466 int ll_local_open(struct file *file, struct lookup_intent *it,
467 struct ll_file_data *fd, struct obd_client_handle *och)
469 struct inode *inode = file->f_dentry->d_inode;
470 struct ll_inode_info *lli = ll_i2info(inode);
/* caller must not have installed private data yet */
473 LASSERT(!LUSTRE_FPRIVATE(file));
478 struct ptlrpc_request *req = it->d.lustre.it_data;
479 struct mdt_body *body;
482 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
486 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
487 if ((it->it_flags & FMODE_WRITE) &&
488 (body->valid & OBD_MD_FLSIZE))
489 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
490 lli->lli_ioepoch, PFID(&lli->lli_fid));
493 LUSTRE_FPRIVATE(file) = fd;
494 ll_readahead_init(inode, &fd->fd_ras);
/* remember the open mode for close-time bookkeeping */
495 fd->fd_omode = it->it_flags;
499 /* Open a file, and (for the very first open) create objects on the OSTs at
500 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
501 * creation or open until ll_lov_setstripe() ioctl is called.
503 * If we already have the stripe MD locally then we don't request it in
504 * md_open(), by passing a lmm_size = 0.
506 * It is up to the application to ensure no other processes open this file
507 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
508 * used. We might be able to avoid races of that sort by getting lli_open_sem
509 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
510 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): this excerpt elides a large number of lines (restart
 * label, och assertion paths, cleanup-label ordering); treat the flow
 * below as a sketch of the full function.
 */
512 int ll_file_open(struct inode *inode, struct file *file)
514 struct ll_inode_info *lli = ll_i2info(inode);
515 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
516 .it_flags = file->f_flags };
517 struct obd_client_handle **och_p = NULL;
518 __u64 *och_usecount = NULL;
519 struct ll_file_data *fd;
520 int rc = 0, opendir_set = 0;
523 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
524 inode->i_generation, inode, file->f_flags);
/* an intent may have been stashed by lookup/revalidate */
526 it = file->private_data; /* XXX: compat macro */
527 file->private_data = NULL; /* prevent ll_local_open assertion */
529 fd = ll_file_data_get();
531 GOTO(out_openerr, rc = -ENOMEM);
/* first opener of a directory becomes the statahead owner */
534 if (S_ISDIR(inode->i_mode)) {
535 spin_lock(&lli->lli_sa_lock);
536 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
537 lli->lli_opendir_pid == 0) {
538 lli->lli_opendir_key = fd;
539 lli->lli_opendir_pid = cfs_curproc_pid();
542 spin_unlock(&lli->lli_sa_lock);
/* the root is always open on the MDS; no rpc needed */
545 if (inode->i_sb->s_root == file->f_dentry) {
546 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent: build one from f_flags */
550 if (!it || !it->d.lustre.it_disposition) {
551 /* Convert f_flags into access mode. We cannot use file->f_mode,
552 * because everything but O_ACCMODE mask was stripped from
 * it */
554 if ((oit.it_flags + 1) & O_ACCMODE)
556 if (file->f_flags & O_TRUNC)
557 oit.it_flags |= FMODE_WRITE;
559 /* kernel only call f_op->open in dentry_open. filp_open calls
560 * dentry_open after call to open_namei that checks permissions.
561 * Only nfsd_open call dentry_open directly without checking
562 * permissions and because of that this code below is safe. */
563 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
564 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
566 /* We do not want O_EXCL here, presumably we opened the file
567 * already? XXX - NFS implications? */
568 oit.it_flags &= ~O_EXCL;
570 /* bug20584, if "it_flags" contains O_CREAT, the file will be
571 * created if necessary, then "IT_CREAT" should be set to keep
572 * consistent with it */
573 if (oit.it_flags & O_CREAT)
574 oit.it_op |= IT_CREAT;
580 /* Let's see if we have file open on MDS already. */
581 if (it->it_flags & FMODE_WRITE) {
582 och_p = &lli->lli_mds_write_och;
583 och_usecount = &lli->lli_open_fd_write_count;
584 } else if (it->it_flags & FMODE_EXEC) {
585 och_p = &lli->lli_mds_exec_och;
586 och_usecount = &lli->lli_open_fd_exec_count;
588 och_p = &lli->lli_mds_read_och;
589 och_usecount = &lli->lli_open_fd_read_count;
592 mutex_lock(&lli->lli_och_mutex);
593 if (*och_p) { /* Open handle is present */
594 if (it_disposition(it, DISP_OPEN_OPEN)) {
595 /* Well, there's extra open request that we do not need,
596 let's close it somehow. This will decref request. */
597 rc = it_open_error(DISP_OPEN_OPEN, it);
599 mutex_unlock(&lli->lli_och_mutex);
600 GOTO(out_openerr, rc);
603 ll_release_openhandle(file->f_dentry, it);
/* reuse the existing handle: no och passed to local open */
607 rc = ll_local_open(file, it, fd, NULL);
610 mutex_unlock(&lli->lli_och_mutex);
611 GOTO(out_openerr, rc);
614 LASSERT(*och_usecount == 0);
615 if (!it->d.lustre.it_disposition) {
616 /* We cannot just request lock handle now, new ELC code
617 means that one of other OPEN locks for this file
618 could be cancelled, and since blocking ast handler
619 would attempt to grab och_mutex as well, that would
620 result in a deadlock */
621 mutex_unlock(&lli->lli_och_mutex);
622 it->it_create_mode |= M_CHECK_STALE;
623 rc = ll_intent_file_open(file, NULL, 0, it);
624 it->it_create_mode &= ~M_CHECK_STALE;
626 GOTO(out_openerr, rc);
630 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
632 GOTO(out_och_free, rc = -ENOMEM);
636 /* md_intent_lock() didn't get a request ref if there was an
637 * open error, so don't do cleanup on the request here
 */
639 /* XXX (green): Should not we bail out on any error here, not
640 * just open error? */
641 rc = it_open_error(DISP_OPEN_OPEN, it);
643 GOTO(out_och_free, rc);
645 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
647 rc = ll_local_open(file, it, fd, *och_p);
649 GOTO(out_och_free, rc);
651 mutex_unlock(&lli->lli_och_mutex);
654 /* Must do this outside lli_och_mutex lock to prevent deadlock where
655 different kind of OPEN lock for this same inode gets cancelled
656 by ldlm_cancel_lru */
657 if (!S_ISREG(inode->i_mode))
658 GOTO(out_och_free, rc);
/* no stripe metadata yet: either creation was delayed or nothing to do */
662 if (!lli->lli_has_smd) {
663 if (file->f_flags & O_LOV_DELAY_CREATE ||
664 !(file->f_mode & FMODE_WRITE)) {
665 CDEBUG(D_INODE, "object creation was delayed\n");
666 GOTO(out_och_free, rc);
669 file->f_flags &= ~O_LOV_DELAY_CREATE;
670 GOTO(out_och_free, rc);
/* cleanup: free an och that was allocated but never installed */
674 if (och_p && *och_p) {
675 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676 *och_p = NULL; /* OBD_FREE writes some magic there */
679 mutex_unlock(&lli->lli_och_mutex);
682 if (opendir_set != 0)
683 ll_stop_statahead(inode, lli->lli_opendir_key);
685 ll_file_data_put(fd);
687 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the OPEN reply reference held by the intent, if any */
690 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
691 ptlrpc_req_finished(it->d.lustre.it_data);
692 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
698 /* Fills the obdo with the attributes for the lsm
 * by issuing an async OST getattr and waiting for it; if @sync, the
 * getattr is done under a server-side lock (OBD_FL_SRVLOCK).
 * NOTE(review): rc checks between steps are elided in this excerpt. */
699 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
700 struct obd_capa *capa, struct obdo *obdo,
701 __u64 ioepoch, int sync)
703 struct ptlrpc_request_set *set;
704 struct obd_info oinfo = { { { 0 } } };
709 LASSERT(lsm != NULL);
713 oinfo.oi_oa->o_oi = lsm->lsm_oi;
714 oinfo.oi_oa->o_mode = S_IFREG;
715 oinfo.oi_oa->o_ioepoch = ioepoch;
/* request the full attribute set we can merge into the inode */
716 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
717 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
718 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
719 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
720 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
721 OBD_MD_FLDATAVERSION;
722 oinfo.oi_capa = capa;
/* sync mode: make the OST take its lock for a coherent answer */
724 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
725 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
728 set = ptlrpc_prep_set();
730 CERROR("can't allocate ptlrpc set\n");
733 rc = obd_getattr_async(exp, &oinfo, set);
735 rc = ptlrpc_set_wait(set);
736 ptlrpc_set_destroy(set);
/* keep only the attributes the OSTs are authoritative for */
739 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
740 OBD_MD_FLATIME | OBD_MD_FLMTIME |
741 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
742 OBD_MD_FLDATAVERSION);
/*
747 * Performs the getattr on the inode and updates its fields.
748 * If @sync != 0, perform the getattr under the server-side lock.
 */
750 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
751 __u64 ioepoch, int sync)
753 struct obd_capa *capa = ll_mdscapa_get(inode);
754 struct lov_stripe_md *lsm;
758 lsm = ccc_inode_lsm_get(inode);
759 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
760 capa, obdo, ioepoch, sync);
763 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* merge the OST-supplied attributes into the VFS inode */
765 obdo_refresh_inode(inode, obdo, obdo->o_valid);
766 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
767 " blksize %lu\n", POSTID(oi), i_size_read(inode),
768 (unsigned long long)inode->i_blocks,
769 (unsigned long)ll_inode_blksize(inode));
771 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with the attributes
 * obtained from the OSTs via the cl_object layer, taking the most recent
 * of each, and update the inode's size/blocks under the size lock.
 */
775 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
777 struct ll_inode_info *lli = ll_i2info(inode);
778 struct cl_object *obj = lli->lli_clob;
779 struct cl_attr *attr = ccc_env_thread_attr(env);
785 ll_inode_size_lock(inode);
786 /* merge timestamps the most recently obtained from mds with
787 timestamps obtained from osts */
788 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
789 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
790 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
791 inode_init_lvb(inode, &lvb);
793 cl_object_attr_lock(obj);
794 rc = cl_object_attr_get(env, obj, attr);
795 cl_object_attr_unlock(obj);
/* take the newer of MDS vs OST timestamps */
798 if (lvb.lvb_atime < attr->cat_atime)
799 lvb.lvb_atime = attr->cat_atime;
800 if (lvb.lvb_ctime < attr->cat_ctime)
801 lvb.lvb_ctime = attr->cat_ctime;
802 if (lvb.lvb_mtime < attr->cat_mtime)
803 lvb.lvb_mtime = attr->cat_mtime;
805 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
806 PFID(&lli->lli_fid), attr->cat_size);
/* size lock is held, so the nolock variant is safe here */
807 cl_isize_write_nolock(inode, attr->cat_size);
809 inode->i_blocks = attr->cat_blocks;
811 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
812 LTIME_S(inode->i_atime) = lvb.lvb_atime;
813 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
815 ll_inode_size_unlock(inode);
/*
 * Glimpse helper for ioctls: fetch the lsm's attributes from the OSTs
 * (no capability, epoch 0, no server lock) and copy size/blocks/times
 * into the caller-supplied stat buffer.
 */
820 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
823 struct obdo obdo = { 0 };
826 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
828 st->st_size = obdo.o_size;
829 st->st_blocks = obdo.o_blocks;
830 st->st_mtime = obdo.o_mtime;
831 st->st_atime = obdo.o_atime;
832 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * behavior and the lock-request policy (never for nolock files, mandatory
 * for O_APPEND, "maybe" otherwise).
 */
837 void ll_io_init(struct cl_io *io, const struct file *file, int write)
839 struct inode *inode = file->f_dentry->d_inode;
841 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
843 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
/* O_SYNC/O_DIRECT writes must reach stable storage before returning */
844 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
845 file->f_flags & O_DIRECT ||
848 io->ci_obj = ll_i2info(inode)->lli_clob;
849 io->ci_lockreq = CILR_MAYBE;
850 if (ll_file_nolock(file)) {
851 io->ci_lockreq = CILR_NEVER;
852 io->ci_no_srvlock = 1;
853 } else if (file->f_flags & O_APPEND) {
854 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (normal iov, sendfile,
 * splice): set up a cl_io, take lli_write_mutex / lli_trunc_sem as the
 * subtype requires, run the cl_io loop, update *ppos and per-mount I/O
 * stats, and track write failures on the fd.
 * NOTE(review): the restart loop, return statement and several braces
 * are elided in this excerpt; the flow below is partial.
 */
859 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
860 struct file *file, enum cl_io_type iot,
861 loff_t *ppos, size_t count)
863 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
864 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
870 io = ccc_env_thread_io(env);
871 ll_io_init(io, file, iot == CIT_WRITE);
873 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
874 struct vvp_io *vio = vvp_env_io(env);
875 struct ccc_io *cio = ccc_env_io(env);
876 int write_mutex_locked = 0;
878 cio->cui_fd = LUSTRE_FPRIVATE(file);
879 vio->cui_io_subtype = args->via_io_subtype;
/* per-subtype setup: copy the caller's iov/actor/pipe into the io */
881 switch (vio->cui_io_subtype) {
883 cio->cui_iov = args->u.normal.via_iov;
884 cio->cui_nrsegs = args->u.normal.via_nrsegs;
885 cio->cui_tot_nrsegs = cio->cui_nrsegs;
886 #ifndef HAVE_FILE_WRITEV
887 cio->cui_iocb = args->u.normal.via_iocb;
/* serialize non-group-locked writes; readers only block truncate */
889 if ((iot == CIT_WRITE) &&
890 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
891 if (mutex_lock_interruptible(&lli->
893 GOTO(out, result = -ERESTARTSYS);
894 write_mutex_locked = 1;
895 } else if (iot == CIT_READ) {
896 down_read(&lli->lli_trunc_sem);
900 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
901 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
904 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
905 vio->u.splice.cui_flags = args->u.splice.via_flags;
908 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
911 result = cl_io_loop(env, io);
912 if (write_mutex_locked)
913 mutex_unlock(&lli->lli_write_mutex);
914 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
915 up_read(&lli->lli_trunc_sem);
917 /* cl_io_rw_init() handled IO */
918 result = io->ci_result;
/* ci_nob is the number of bytes actually transferred */
921 if (io->ci_nob > 0) {
923 *ppos = io->u.ci_wr.wr.crw_pos;
928 /* If any bit been read/written (result != 0), we just return
929 * short read/write instead of restart io. */
930 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
931 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
932 iot == CIT_READ ? "read" : "write",
933 file->f_dentry->d_name.name, *ppos, count);
934 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
938 if (iot == CIT_READ) {
940 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
941 LPROC_LL_READ_BYTES, result);
942 } else if (iot == CIT_WRITE) {
944 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
945 LPROC_LL_WRITE_BYTES, result);
946 fd->fd_write_failed = false;
/* interrupted writes are not counted as failures */
947 } else if (result != -ERESTARTSYS) {
948 fd->fd_write_failed = true;
/*
957 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 * Validate an iovec array and compute the total byte count, truncating
 * at the first inaccessible segment.
 * NOTE(review): the continue/early-return lines of the loop are elided
 * in this excerpt.
 */
959 static int ll_file_get_iov_count(const struct iovec *iov,
960 unsigned long *nr_segs, size_t *count)
965 for (seg = 0; seg < *nr_segs; seg++) {
966 const struct iovec *iv = &iov[seg];
969 * If any segment has a negative length, or the cumulative
970 * length ever wraps negative then return -EINVAL.
 */
973 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
975 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
980 cnt -= iv->iov_len; /* This segment is no good */
987 #ifdef HAVE_FILE_READV
/*
 * Vectored read entry point (pre-aio kernels): validate the iovec,
 * then run the generic cl_io read path.
 */
988 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
989 unsigned long nr_segs, loff_t *ppos)
992 struct vvp_io_args *args;
998 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1002 env = cl_env_get(&refcheck);
1004 RETURN(PTR_ERR(env));
1006 args = vvp_env_args(env, IO_NORMAL);
1007 args->u.normal.via_iov = (struct iovec *)iov;
1008 args->u.normal.via_nrsegs = nr_segs;
1010 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
1011 cl_env_put(env, &refcheck);
/*
 * Single-buffer read (pre-aio kernels): wrap the user buffer in a
 * one-element iovec and delegate to ll_file_readv().
 */
1015 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1019 struct iovec *local_iov;
1024 env = cl_env_get(&refcheck);
1026 RETURN(PTR_ERR(env));
/* use the per-env scratch iovec to avoid a stack/heap allocation */
1028 local_iov = &vvp_env_info(env)->vti_local_iov;
1029 local_iov->iov_base = (void __user *)buf;
1030 local_iov->iov_len = count;
1031 result = ll_file_readv(file, local_iov, 1, ppos);
1032 cl_env_put(env, &refcheck);
/*
 * aio read entry point: validate the iovec, then run the generic cl_io
 * read path using the kiocb's file and position.
 */
1037 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1038 unsigned long nr_segs, loff_t pos)
1041 struct vvp_io_args *args;
1047 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1051 env = cl_env_get(&refcheck);
1053 RETURN(PTR_ERR(env));
1055 args = vvp_env_args(env, IO_NORMAL);
1056 args->u.normal.via_iov = (struct iovec *)iov;
1057 args->u.normal.via_nrsegs = nr_segs;
1058 args->u.normal.via_iocb = iocb;
/* ki_pos is updated in place by the generic path */
1060 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1061 &iocb->ki_pos, count);
1062 cl_env_put(env, &refcheck);
/*
 * Single-buffer read (aio kernels): build a synchronous kiocb plus a
 * one-element iovec and delegate to ll_file_aio_read(), then propagate
 * the updated position back to *ppos.
 */
1066 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1070 struct iovec *local_iov;
1071 struct kiocb *kiocb;
1076 env = cl_env_get(&refcheck);
1078 RETURN(PTR_ERR(env));
/* scratch iov/kiocb live in the per-env info to avoid allocation */
1080 local_iov = &vvp_env_info(env)->vti_local_iov;
1081 kiocb = &vvp_env_info(env)->vti_kiocb;
1082 local_iov->iov_base = (void __user *)buf;
1083 local_iov->iov_len = count;
1084 init_sync_kiocb(kiocb, file);
1085 kiocb->ki_pos = *ppos;
1086 kiocb->ki_left = count;
1088 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1089 *ppos = kiocb->ki_pos;
1091 cl_env_put(env, &refcheck);
/*
1097 * Write to a file (through the page cache).
 */
1099 #ifdef HAVE_FILE_WRITEV
/*
 * Vectored write entry point (pre-aio kernels): validate the iovec,
 * then run the generic cl_io write path.
 */
1100 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1101 unsigned long nr_segs, loff_t *ppos)
1104 struct vvp_io_args *args;
1110 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1114 env = cl_env_get(&refcheck);
1116 RETURN(PTR_ERR(env));
1118 args = vvp_env_args(env, IO_NORMAL);
1119 args->u.normal.via_iov = (struct iovec *)iov;
1120 args->u.normal.via_nrsegs = nr_segs;
1122 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1123 cl_env_put(env, &refcheck);
/*
 * Single-buffer write (pre-aio kernels): wrap the user buffer in a
 * one-element iovec and delegate to ll_file_writev().
 */
1127 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1131 struct iovec *local_iov;
1136 env = cl_env_get(&refcheck);
1138 RETURN(PTR_ERR(env));
1140 local_iov = &vvp_env_info(env)->vti_local_iov;
1141 local_iov->iov_base = (void __user *)buf;
1142 local_iov->iov_len = count;
1144 result = ll_file_writev(file, local_iov, 1, ppos);
1145 cl_env_put(env, &refcheck);
1149 #else /* AIO stuff */
/*
 * aio write entry point: validate the iovec, then run the generic cl_io
 * write path using the kiocb's file and position.
 */
1150 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1151 unsigned long nr_segs, loff_t pos)
1154 struct vvp_io_args *args;
1160 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1164 env = cl_env_get(&refcheck);
1166 RETURN(PTR_ERR(env));
1168 args = vvp_env_args(env, IO_NORMAL);
1169 args->u.normal.via_iov = (struct iovec *)iov;
1170 args->u.normal.via_nrsegs = nr_segs;
1171 args->u.normal.via_iocb = iocb;
/* ki_pos is updated in place by the generic path */
1173 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1174 &iocb->ki_pos, count);
1175 cl_env_put(env, &refcheck);
/*
 * Single-buffer write (aio kernels): build a synchronous kiocb plus a
 * one-element iovec and delegate to ll_file_aio_write(), then propagate
 * the updated position back to *ppos.
 */
1179 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1183 struct iovec *local_iov;
1184 struct kiocb *kiocb;
1189 env = cl_env_get(&refcheck);
1191 RETURN(PTR_ERR(env));
/* scratch iov/kiocb live in the per-env info to avoid allocation */
1193 local_iov = &vvp_env_info(env)->vti_local_iov;
1194 kiocb = &vvp_env_info(env)->vti_kiocb;
1195 local_iov->iov_base = (void __user *)buf;
1196 local_iov->iov_len = count;
1197 init_sync_kiocb(kiocb, file);
1198 kiocb->ki_pos = *ppos;
1199 kiocb->ki_left = count;
1201 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1202 *ppos = kiocb->ki_pos;
1204 cl_env_put(env, &refcheck);
/*
1210 * Send file content (through pagecache) somewhere with helper
 * (splice): run the generic cl_io read path with the IO_SPLICE subtype.
 */
1212 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1213 struct pipe_inode_info *pipe, size_t count,
1217 struct vvp_io_args *args;
1222 env = cl_env_get(&refcheck);
1224 RETURN(PTR_ERR(env));
1226 args = vvp_env_args(env, IO_SPLICE);
1227 args->u.splice.via_pipe = pipe;
1228 args->u.splice.via_flags = flags;
1230 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1231 cl_env_put(env, &refcheck);
/*
 * Re-create the OST object on @ost_idx for this inode's stripe metadata
 * (recovery tool path): clone the lsm, mark the obdo with
 * OBD_FL_RECREATE_OBJS and call obd_create() under the size lock.
 * NOTE(review): obdo allocation and several error branches are elided
 * in this excerpt.
 */
1235 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1238 struct obd_export *exp = ll_i2dtexp(inode);
1239 struct obd_trans_info oti = { 0 };
1240 struct obdo *oa = NULL;
1243 struct lov_stripe_md *lsm = NULL, *lsm2;
1250 lsm = ccc_inode_lsm_get(inode);
1251 if (!lsm_has_objects(lsm))
1252 GOTO(out, rc = -ENOENT);
/* size of an lsm including its per-stripe lov_oinfo array */
1254 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1255 (lsm->lsm_stripe_count));
1257 OBD_ALLOC_LARGE(lsm2, lsm_size);
1259 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request */
1262 oa->o_nlink = ost_idx;
1263 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1264 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1265 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1266 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1267 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1268 memcpy(lsm2, lsm, lsm_size);
1269 ll_inode_size_lock(inode);
1270 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1271 ll_inode_size_unlock(inode);
1273 OBD_FREE_LARGE(lsm2, lsm_size);
1276 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl handler: recreate an OST object identified by a user-supplied
 * ll_recreate_obj (object id + OST index). Requires CAP_SYS_ADMIN.
 */
1283 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1285 struct ll_recreate_obj ucreat;
1287 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1290 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
/* legacy interface: object ids are in the MDT0 sequence */
1294 ostid_set_seq_mdt0(&oi);
1295 ostid_set_id(&oi, ucreat.lrc_id);
1296 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl helper: copy a lu_fid from user space,
 * convert it to an ost_id and recreate the corresponding object.
 * Admin-only (CFS_CAP_SYS_ADMIN).
 */
1299 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1306 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1309 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1312 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the FID sequence. */
1313 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1314 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply striping information (@lum, @lum_size) to @inode by replaying
 * an IT_OPEN intent carrying the layout EA.  Fails if the file already
 * has a layout.  On success the transient open handle is released via
 * ll_release_openhandle().
 * NOTE(review): error-path labels and the final RETURN are elided from
 * this view; only comments were added here.
 */
1317 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1318 int flags, struct lov_user_md *lum, int lum_size)
1320 struct lov_stripe_md *lsm = NULL;
1321 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* A stripe may only be set once; an existing lsm means EEXIST-like failure. */
1325 lsm = ccc_inode_lsm_get(inode);
1327 ccc_inode_lsm_put(inode, lsm);
1328 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1333 ll_inode_size_lock(inode);
/* Send the open intent (with layout EA) to the MDS. */
1334 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1337 rc = oit.d.lustre.it_status;
1339 GOTO(out_req_free, rc);
/* Close the MDS open handle created purely to carry the setstripe. */
1341 ll_release_openhandle(file->f_dentry, &oit);
1344 ll_inode_size_unlock(inode);
1345 ll_intent_release(&oit);
1346 ccc_inode_lsm_put(inode, lsm);
1349 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping) of @filename (a child of directory
 * @inode) via md_getattr_name().  On success *lmmp points into the
 * still-referenced reply buffer *request, and *lmm_size is its length;
 * the caller owns finishing *request.  The EA is byte-swapped to host
 * endianness when needed so user space sees native layout data.
 */
1353 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1354 struct lov_mds_md **lmmp, int *lmm_size,
1355 struct ptlrpc_request **request)
1357 struct ll_sb_info *sbi = ll_i2sbi(inode);
1358 struct mdt_body *body;
1359 struct lov_mds_md *lmm = NULL;
1360 struct ptlrpc_request *req = NULL;
1361 struct md_op_data *op_data;
/* Ask the MDC layer for the largest EA we may receive. */
1364 rc = ll_get_max_mdsize(sbi, &lmmsize);
1368 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1369 strlen(filename), lmmsize,
1370 LUSTRE_OPC_ANY, NULL);
1371 if (IS_ERR(op_data))
1372 RETURN(PTR_ERR(op_data));
1374 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1375 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1376 ll_finish_md_op_data(op_data);
1378 CDEBUG(D_INFO, "md_getattr_name failed "
1379 "on %s: rc %d\n", filename, rc);
1383 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1384 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1386 lmmsize = body->eadatasize;
/* No striping EA present (e.g. unstriped file/dir): report ENODATA. */
1388 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1390 GOTO(out, rc = -ENODATA);
1393 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1394 LASSERT(lmm != NULL);
/* Only plain V1/V3 LOV EAs are understood here. */
1396 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1397 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1398 GOTO(out, rc = -EPROTO);
1402 * This is coming from the MDS, so is probably in
1403 * little endian. We convert it to host endian before
1404 * passing it to userspace.
/* Only swab on big-endian hosts (LOV_MAGIC differs from its LE form). */
1406 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1409 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1410 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1413 /* if function called for directory - we should
1414 * avoid swab not existent lsm objects */
1415 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1416 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
/* Per-object entries exist only for regular files, not directories. */
1417 if (S_ISREG(body->mode))
1418 lustre_swab_lov_user_md_objects(
1419 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1421 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1422 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1423 if (S_ISREG(body->mode))
1424 lustre_swab_lov_user_md_objects(
1425 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1432 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl helper: admin-only path that copies a
 * lov_user_md (with one ost_data entry) from user space and applies it
 * via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the
 * caller supplies pre-existing objects rather than asking for new ones.
 */
1437 static int ll_lov_setea(struct inode *inode, struct file *file,
1440 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1441 struct lov_user_md *lump;
1442 int lum_size = sizeof(struct lov_user_md) +
1443 sizeof(struct lov_user_ost_data);
1447 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1450 OBD_ALLOC_LARGE(lump, lum_size);
1454 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1455 OBD_FREE_LARGE(lump, lum_size);
1459 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1461 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl helper: read a lov_user_md (V1 first,
 * upgrading the copy to V3 when the magic says so) from user space and
 * apply it.  On success the user's lmm_stripe_count is refreshed from
 * the now-instantiated layout via a GETSTRIPE iocontrol.
 * NOTE(review): interleaving lines (declarations, error checks, RETURN)
 * are elided from this view; only comments were added here.
 */
1465 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1468 struct lov_user_md_v3 lumv3;
1469 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1470 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1471 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1473 int flags = FMODE_WRITE;
1476 /* first try with v1 which is smaller than v3 */
1477 lum_size = sizeof(struct lov_user_md_v1);
1478 if (copy_from_user(lumv1, lumv1p, lum_size))
/* V3 magic: re-copy the full V3 structure over the same buffer. */
1481 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1482 lum_size = sizeof(struct lov_user_md_v3);
1483 if (copy_from_user(&lumv3, lumv3p, lum_size))
1487 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1489 struct lov_stripe_md *lsm;
/* Pre-clear the user's stripe count before writing back real data. */
1492 put_user(0, &lumv1p->lmm_stripe_count);
1494 ll_layout_refresh(inode, &gen);
1495 lsm = ccc_inode_lsm_get(inode);
1496 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1497 0, lsm, (void *)arg);
1498 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl helper: hand the inode's current lsm to
 * the data export's iocontrol, which copies the layout to user @arg.
 */
1503 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1505 struct lov_stripe_md *lsm;
1509 lsm = ccc_inode_lsm_get(inode);
1511 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1513 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK: take a cluster-wide group (GID) lock on the file.
 * Only one group lock per file descriptor is allowed; fd_flags and
 * fd_grouplock are updated under lli_lock.  cl_get_grouplock() is
 * called outside the spinlock since it may block, so the flag is
 * re-checked afterwards to resolve races with a concurrent locker.
 */
1517 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1519 struct ll_inode_info *lli = ll_i2info(inode);
1520 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1521 struct ccc_grouplock grouplock;
/* Group locks are meaningless when the mount disables locking. */
1525 if (ll_file_nolock(file))
1526 RETURN(-EOPNOTSUPP);
1528 spin_lock(&lli->lli_lock);
1529 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1530 CWARN("group lock already existed with gid %lu\n",
1531 fd->fd_grouplock.cg_gid);
1532 spin_unlock(&lli->lli_lock);
1535 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1536 spin_unlock(&lli->lli_lock);
/* May block unless O_NONBLOCK; hence taken without lli_lock held. */
1538 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1539 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1543 spin_lock(&lli->lli_lock);
/* Another thread may have installed a group lock while we slept. */
1544 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1545 spin_unlock(&lli->lli_lock);
1546 CERROR("another thread just won the race\n");
1547 cl_put_grouplock(&grouplock);
1551 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1552 fd->fd_grouplock = grouplock;
1553 spin_unlock(&lli->lli_lock);
1555 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK: release the group lock previously taken on this
 * file descriptor.  The GID in @arg must match the one held; the fd
 * state is cleared under lli_lock and the cl-layer lock is dropped
 * afterwards (cl_put_grouplock may block).
 */
1559 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1561 struct ll_inode_info *lli = ll_i2info(inode);
1562 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1563 struct ccc_grouplock grouplock;
1566 spin_lock(&lli->lli_lock);
1567 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1568 spin_unlock(&lli->lli_lock);
1569 CWARN("no group lock held\n");
1572 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* Refuse to drop a lock the caller does not actually own. */
1574 if (fd->fd_grouplock.cg_gid != arg) {
1575 CWARN("group lock %lu doesn't match current id %lu\n",
1576 arg, fd->fd_grouplock.cg_gid);
1577 spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd before releasing it outside the spinlock. */
1581 grouplock = fd->fd_grouplock;
1582 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1583 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1584 spin_unlock(&lli->lli_lock);
1586 cl_put_grouplock(&grouplock);
1587 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1592 * Close inode open handle
1594 * \param dentry [in] dentry which contains the inode
1595 * \param it [in,out] intent which contains open info and result
1598 * \retval <0 failure
/*
 * Close the MDS open handle attached to an open intent (used after
 * intent opens done purely to carry layout operations).  Root dentry
 * and intents with no DISP_OPEN_OPEN disposition are no-ops.
 */
1600 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1602 struct inode *inode = dentry->d_inode;
1603 struct obd_client_handle *och;
1609 /* Root ? Do nothing. */
1610 if (dentry->d_inode->i_sb->s_root == dentry)
1613 /* No open handle to close? Move away */
1614 if (!it_disposition(it, DISP_OPEN_OPEN))
1617 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1619 OBD_ALLOC(och, sizeof(*och));
1621 GOTO(out, rc = -ENOMEM);
/* Build the client handle from the intent, then close it on the MDS. */
1623 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1624 ll_i2info(inode), it, och);
1626 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1629 /* this one is in place of ll_file_open */
1630 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1631 ptlrpc_req_finished(it->d.lustre.it_data);
1632 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1638 * Get size for inode for which FIEMAP mapping is requested.
1639 * Make the FIEMAP get_info call and returns the result.
/*
 * Core FIEMAP implementation: validate flags, optionally flush dirty
 * pages (FIEMAP_FLAG_SYNC), then pass the request to the data export
 * via obd_get_info(KEY_FIEMAP).  Extents are written back into @fiemap,
 * which the caller sized to @num_bytes.
 */
1641 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1644 struct obd_export *exp = ll_i2dtexp(inode);
1645 struct lov_stripe_md *lsm = NULL;
1646 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1647 int vallen = num_bytes;
1651 /* Checks for fiemap flags */
1652 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do NOT support, per fiemap convention. */
1653 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1657 /* Check for FIEMAP_FLAG_SYNC */
1658 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1659 rc = filemap_fdatawrite(inode->i_mapping);
1664 lsm = ccc_inode_lsm_get(inode);
1668 /* If the stripe_count > 1 and the application does not understand
1669 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1671 if (lsm->lsm_stripe_count > 1 &&
1672 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1673 GOTO(out, rc = -EOPNOTSUPP);
1675 fm_key.oa.o_oi = lsm->lsm_oi;
1676 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1678 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1679 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1680 /* If filesize is 0, then there would be no objects for mapping */
1681 if (fm_key.oa.o_size == 0) {
1682 fiemap->fm_mapped_extents = 0;
1686 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1688 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1691 CERROR("obd_get_info failed: rc = %d\n", rc);
1694 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH: translate a FID to a pathname via the MD export.
 * Copies a getinfo_fid2path header from user space, allocates an output
 * buffer sized to the user-requested gf_pathlen, runs the iocontrol,
 * and copies the filled structure back.  Restricted to users with
 * DAC_READ_SEARCH capability unless the mount allows user fid2path.
 */
1698 int ll_fid2path(struct inode *inode, void *arg)
1700 struct obd_export *exp = ll_i2mdexp(inode);
1701 struct getinfo_fid2path *gfout, *gfin;
1705 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1706 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1709 /* Need to get the buflen */
1710 OBD_ALLOC_PTR(gfin);
1713 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
/* Output = fixed header + variable path buffer of gf_pathlen bytes. */
1718 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1719 OBD_ALLOC(gfout, outsize);
1720 if (gfout == NULL) {
1724 memcpy(gfout, gfin, sizeof(*gfout));
1727 /* Call mdc_iocontrol */
1728 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1732 if (copy_to_user(arg, gfout, outsize))
1736 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl wrapper: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and, when extents were
 * requested, the first extent used as a continuation cursor) from user
 * space, run ll_do_fiemap(), and copy header + mapped extents back.
 */
1740 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1742 struct ll_user_fiemap *fiemap_s;
1743 size_t num_bytes, ret_bytes;
1744 unsigned int extent_count;
1747 /* Get the extent count so we can calculate the size of
1748 * required fiemap buffer */
1749 if (get_user(extent_count,
1750 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) is unchecked here; a huge
 * user value could overflow num_bytes — verify upstream validation. */
1752 num_bytes = sizeof(*fiemap_s) + (extent_count *
1753 sizeof(struct ll_fiemap_extent));
1755 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1756 if (fiemap_s == NULL)
1759 /* get the fiemap value */
1760 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1762 GOTO(error, rc = -EFAULT);
1764 /* If fm_extent_count is non-zero, read the first extent since
1765 * it is used to calculate end_offset and device from previous
1768 if (copy_from_user(&fiemap_s->fm_extents[0],
1769 (char __user *)arg + sizeof(*fiemap_s),
1770 sizeof(struct ll_fiemap_extent)))
1771 GOTO(error, rc = -EFAULT);
1774 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1778 ret_bytes = sizeof(struct ll_user_fiemap);
/* Only copy back as many extents as were actually mapped. */
1780 if (extent_count != 0)
1781 ret_bytes += (fiemap_s->fm_mapped_extents *
1782 sizeof(struct ll_fiemap_extent));
1784 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1788 OBD_FREE_LARGE(fiemap_s, num_bytes);
1793 * Read the data_version for inode.
1795 * This value is computed using stripe object version on OST.
1796 * Version is computed using server side locking.
1798 * @param extent_lock Take extent lock. Not needed if a process is already
1799 * holding the OST object group locks.
/*
 * Returns 0 with *data_version set; an unstriped file reports version 0.
 * The version comes from a getattr on the stripe objects
 * (ll_lsm_getattr) and is only valid if OBD_MD_FLDATAVERSION is set in
 * the returned obdo.
 */
1801 int ll_data_version(struct inode *inode, __u64 *data_version,
1804 struct lov_stripe_md *lsm = NULL;
1805 struct ll_sb_info *sbi = ll_i2sbi(inode);
1806 struct obdo *obdo = NULL;
1810 /* If no stripe, we consider version is 0. */
1811 lsm = ccc_inode_lsm_get(inode);
1812 if (!lsm_has_objects(lsm)) {
1814 CDEBUG(D_INODE, "No object for inode\n");
1818 OBD_ALLOC_PTR(obdo);
1820 GOTO(out, rc = -ENOMEM);
1822 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* The OST must explicitly report a data version for it to be usable. */
1824 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1827 *data_version = obdo->o_data_version;
1833 ccc_inode_lsm_put(inode, lsm);
/*
 * Scratch state for ll_swap_layouts(): saved timestamps (ia1/ia2) for
 * both inodes, the two inodes ordered by FID, and the data-version
 * values/flags to verify before committing the swap.
 */
1837 struct ll_swap_stack {
1838 struct iattr ia1, ia2;
1840 struct inode *inode1, *inode2;
1841 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.  Steps:
 *  - permission / same-sb / regular-file checks;
 *  - order the pair by FID to get a stable locking order;
 *  - optionally take group locks (gid != 0) to flush dirty cache;
 *  - optionally verify data versions have not changed (EAGAIN if so);
 *  - send the swap to the MDT through obd_iocontrol with a
 *    mdc_swap_layouts payload in md_op_data;
 *  - optionally restore mtime/atime afterwards per sl_flags.
 * NOTE(review): many lines (gid/dv declarations, GOTO targets, RETURN)
 * are elided from this view; only comments were added here.
 */
1844 static int ll_swap_layouts(struct file *file1, struct file *file2,
1845 struct lustre_swap_layouts *lsl)
1847 struct mdc_swap_layouts msl;
1848 struct md_op_data *op_data;
1851 struct ll_swap_stack *llss = NULL;
1854 OBD_ALLOC_PTR(llss);
1858 llss->inode1 = file1->f_dentry->d_inode;
1859 llss->inode2 = file2->f_dentry->d_inode;
1861 if (!S_ISREG(llss->inode2->i_mode))
1862 GOTO(free, rc = -EINVAL);
1864 if (inode_permission(llss->inode1, MAY_WRITE) ||
1865 inode_permission(llss->inode2, MAY_WRITE))
1866 GOTO(free, rc = -EPERM);
1868 if (llss->inode2->i_sb != llss->inode1->i_sb)
1869 GOTO(free, rc = -EXDEV);
1871 /* we use 2 bool because it is easier to swap than 2 bits */
1872 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1873 llss->check_dv1 = true;
1875 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1876 llss->check_dv2 = true;
1878 /* we cannot use lsl->sl_dvX directly because we may swap them */
1879 llss->dv1 = lsl->sl_dv1;
1880 llss->dv2 = lsl->sl_dv2;
/* Order the two files by FID so locking order is globally consistent. */
1882 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1883 if (rc == 0) /* same file, done! */
1886 if (rc < 0) { /* sequentialize it */
1887 swap(llss->inode1, llss->inode2);
1889 swap(llss->dv1, llss->dv2);
1890 swap(llss->check_dv1, llss->check_dv2);
1894 if (gid != 0) { /* application asks to flush dirty cache */
1895 rc = ll_get_grouplock(llss->inode1, file1, gid);
1899 rc = ll_get_grouplock(llss->inode2, file2, gid);
1901 ll_put_grouplock(llss->inode1, file1, gid);
1906 /* to be able to restore mtime and atime after swap
1907 * we need to first save them */
1909 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1910 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1911 llss->ia1.ia_atime = llss->inode1->i_atime;
1912 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1913 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1914 llss->ia2.ia_atime = llss->inode2->i_atime;
1915 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1918 /* ultimate check, before swaping the layouts we check if
1919 * dataversion has changed (if requested) */
1920 if (llss->check_dv1) {
1921 rc = ll_data_version(llss->inode1, &dv, 0);
1924 if (dv != llss->dv1)
1925 GOTO(putgl, rc = -EAGAIN);
1928 if (llss->check_dv2) {
1929 rc = ll_data_version(llss->inode2, &dv, 0);
1932 if (dv != llss->dv2)
1933 GOTO(putgl, rc = -EAGAIN);
1936 /* struct md_op_data is used to send the swap args to the mdt
1937 * only flags is missing, so we use struct mdc_swap_layouts
1938 * through the md_op_data->op_data */
1939 /* flags from user space have to be converted before they are send to
1940 * server, no flag is sent today, they are only used on the client */
1943 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1944 0, LUSTRE_OPC_ANY, &msl);
1945 if (IS_ERR(op_data))
1946 GOTO(free, rc = PTR_ERR(op_data));
1948 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1949 sizeof(*op_data), op_data, NULL);
1950 ll_finish_md_op_data(op_data);
/* putgl: drop the group locks in reverse acquisition order. */
1954 ll_put_grouplock(llss->inode2, file2, gid);
1955 ll_put_grouplock(llss->inode1, file1, gid);
1958 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1962 /* clear useless flags */
1963 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1964 llss->ia1.ia_valid &= ~ATTR_MTIME;
1965 llss->ia2.ia_valid &= ~ATTR_MTIME;
1968 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1969 llss->ia1.ia_valid &= ~ATTR_ATIME;
1970 llss->ia2.ia_valid &= ~ATTR_ATIME;
1973 /* update time if requested */
/* Note ia2 is applied to file1 and ia1 to file2: the layouts (and thus
 * the data the times describe) have been exchanged. */
1975 if (llss->ia2.ia_valid != 0) {
1976 mutex_lock(&llss->inode1->i_mutex);
1977 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1978 mutex_unlock(&llss->inode1->i_mutex);
1981 if (llss->ia1.ia_valid != 0) {
1984 mutex_lock(&llss->inode2->i_mutex);
1985 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1986 mutex_unlock(&llss->inode2->i_mutex);
/*
 * unlocked_ioctl handler for Lustre regular files.  Dispatches the
 * Lustre-private ioctl namespace (flags, striping, group locks, FIEMAP,
 * FID/path translation, data version, HSM state/action, layout swap)
 * and falls through to ll_iocontrol_call()/obd_iocontrol() for anything
 * unrecognized.  tty ioctls are explicitly rejected up front.
 * NOTE(review): numerous lines (declarations, RETURNs, error labels,
 * allocations for hus/hss/hca) are elided from this view; only comments
 * were added here.
 */
1998 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2000 struct inode *inode = file->f_dentry->d_inode;
2001 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2005 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2006 inode->i_generation, inode, cmd);
2007 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2009 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2010 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2014 case LL_IOC_GETFLAGS:
2015 /* Get the current value of the file flags */
2016 return put_user(fd->fd_flags, (int *)arg);
2017 case LL_IOC_SETFLAGS:
2018 case LL_IOC_CLRFLAGS:
2019 /* Set or clear specific file flags */
2020 /* XXX This probably needs checks to ensure the flags are
2021 * not abused, and to handle any flag side effects.
2023 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only safe to set with O_DIRECT I/O. */
2026 if (cmd == LL_IOC_SETFLAGS) {
2027 if ((flags & LL_FILE_IGNORE_LOCK) &&
2028 !(file->f_flags & O_DIRECT)) {
2029 CERROR("%s: unable to disable locking on "
2030 "non-O_DIRECT file\n", current->comm);
2034 fd->fd_flags |= flags;
2036 fd->fd_flags &= ~flags;
2039 case LL_IOC_LOV_SETSTRIPE:
2040 RETURN(ll_lov_setstripe(inode, file, arg));
2041 case LL_IOC_LOV_SETEA:
2042 RETURN(ll_lov_setea(inode, file, arg));
2043 case LL_IOC_LOV_SWAP_LAYOUTS: {
2045 struct lustre_swap_layouts lsl;
2047 if (copy_from_user(&lsl, (char *)arg,
2048 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for write for a layout swap. */
2051 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2054 file2 = fget(lsl.sl_fd);
2059 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2060 rc = ll_swap_layouts(file, file2, &lsl);
2064 case LL_IOC_LOV_GETSTRIPE:
2065 RETURN(ll_lov_getstripe(inode, arg));
2066 case LL_IOC_RECREATE_OBJ:
2067 RETURN(ll_lov_recreate_obj(inode, arg));
2068 case LL_IOC_RECREATE_FID:
2069 RETURN(ll_lov_recreate_fid(inode, arg));
2070 case FSFILT_IOC_FIEMAP:
2071 RETURN(ll_ioctl_fiemap(inode, arg));
2072 case FSFILT_IOC_GETFLAGS:
2073 case FSFILT_IOC_SETFLAGS:
2074 RETURN(ll_iocontrol(inode, file, cmd, arg));
2075 case FSFILT_IOC_GETVERSION_OLD:
2076 case FSFILT_IOC_GETVERSION:
2077 RETURN(put_user(inode->i_generation, (int *)arg));
2078 case LL_IOC_GROUP_LOCK:
2079 RETURN(ll_get_grouplock(inode, file, arg));
2080 case LL_IOC_GROUP_UNLOCK:
2081 RETURN(ll_put_grouplock(inode, file, arg));
2082 case IOC_OBD_STATFS:
2083 RETURN(ll_obd_statfs(inode, (void *)arg));
2085 /* We need to special case any other ioctls we want to handle,
2086 * to send them to the MDS/OST as appropriate and to properly
2087 * network encode the arg field.
2088 case FSFILT_IOC_SETVERSION_OLD:
2089 case FSFILT_IOC_SETVERSION:
2091 case LL_IOC_FLUSHCTX:
2092 RETURN(ll_flush_ctx(inode));
2093 case LL_IOC_PATH2FID: {
2094 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2095 sizeof(struct lu_fid)))
2100 case OBD_IOC_FID2PATH:
2101 RETURN(ll_fid2path(inode, (void *)arg));
2102 case LL_IOC_DATA_VERSION: {
2103 struct ioc_data_version idv;
2106 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH lets holders of group locks skip the extent lock. */
2109 rc = ll_data_version(inode, &idv.idv_version,
2110 !(idv.idv_flags & LL_DV_NOFLUSH));
2112 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2118 case LL_IOC_GET_MDTIDX: {
2121 mdtidx = ll_get_mdt_idx(inode);
2125 if (put_user((int)mdtidx, (int*)arg))
2130 case OBD_IOC_GETDTNAME:
2131 case OBD_IOC_GETMDNAME:
2132 RETURN(ll_get_obd_name(inode, cmd, arg));
2133 case LL_IOC_HSM_STATE_GET: {
2134 struct md_op_data *op_data;
2135 struct hsm_user_state *hus;
/* hus is passed as op_data payload; the MDT fills it via iocontrol. */
2142 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2143 LUSTRE_OPC_ANY, hus);
2144 if (IS_ERR(op_data)) {
2146 RETURN(PTR_ERR(op_data));
2149 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2152 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2155 ll_finish_md_op_data(op_data);
2159 case LL_IOC_HSM_STATE_SET: {
2160 struct md_op_data *op_data;
2161 struct hsm_state_set *hss;
2167 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2172 /* Non-root users are forbidden to set or clear flags which are
2173 * NOT defined in HSM_USER_MASK. */
2174 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2175 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2180 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2181 LUSTRE_OPC_ANY, hss);
2182 if (IS_ERR(op_data)) {
2184 RETURN(PTR_ERR(op_data));
2187 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2190 ll_finish_md_op_data(op_data);
2195 case LL_IOC_HSM_ACTION: {
2196 struct md_op_data *op_data;
2197 struct hsm_current_action *hca;
2204 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2205 LUSTRE_OPC_ANY, hca);
2206 if (IS_ERR(op_data)) {
2208 RETURN(PTR_ERR(op_data));
2211 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2214 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2217 ll_finish_md_op_data(op_data);
/* Unknown command: offer it to registered handlers, else the OSC/DT. */
2225 ll_iocontrol_call(inode, file, cmd, arg, &err))
2228 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2234 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper (older kernels without generic_file_llseek_size):
 * validate @offset against sign rules and @maxsize, then commit it to
 * file->f_pos, resetting f_version on change.
 */
2235 static inline loff_t
2236 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2238 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2240 if (offset > maxsize)
2243 if (offset != file->f_pos) {
2244 file->f_pos = offset;
/* f_version is a readdir/seek cookie; invalidate it on reposition. */
2245 file->f_version = 0;
/*
 * Compat backport of the kernel's generic_file_llseek_size(): llseek
 * with caller-supplied @maxsize and @eof so SEEK_END/SEEK_HOLE/
 * SEEK_DATA can use Lustre's glimpsed size instead of i_size.
 * NOTE(review): the SEEK_* switch arms are partially elided from this
 * view; only comments were added here.
 */
2251 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2252 loff_t maxsize, loff_t eof)
2254 struct inode *inode = file->f_dentry->d_inode;
2262 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2263 * position-querying operation. Avoid rewriting the "same"
2264 * f_pos value back to the file because a concurrent read(),
2265 * write() or lseek() might have altered it
2270 * f_lock protects against read/modify/write race with other
2271 * SEEK_CURs. Note that parallel writes and reads behave
2274 mutex_lock(&inode->i_mutex);
2275 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2276 mutex_unlock(&inode->i_mutex);
2280 * In the generic case the entire file is data, so as long as
2281 * offset isn't at the end of the file then the offset is data.
2288 * There is a virtual hole at the end of the file, so as long as
2289 * offset isn't i_size or larger, return i_size.
2297 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA we must first
 * glimpse the real file size from the OSTs (client i_size may be
 * stale), then delegate to the llseek_size helper with Lustre's
 * per-file maximum offset.
 */
2301 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2303 struct inode *inode = file->f_dentry->d_inode;
2304 loff_t retval, eof = 0;
2307 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2308 (origin == SEEK_CUR) ? file->f_pos : 0);
2309 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2310 inode->i_ino, inode->i_generation, inode, retval, retval,
2312 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on an up-to-date size: fetch it from the OSTs. */
2314 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2315 retval = ll_glimpse_size(inode);
2318 eof = i_size_read(inode);
2321 retval = ll_generic_file_llseek_size(file, offset, origin,
2322 ll_file_maxbytes(inode), eof);
/*
 * flush handler (called on every close of a file descriptor): harvest
 * async writeback errors recorded on the inode and the cl-object, and
 * report -EIO unless the error was already delivered to the
 * application via a failed fsync (fd_write_failed).
 */
2326 int ll_flush(struct file *file, fl_owner_t id)
2328 struct inode *inode = file->f_dentry->d_inode;
2329 struct ll_inode_info *lli = ll_i2info(inode);
2330 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2333 LASSERT(!S_ISDIR(inode->i_mode));
2335 /* catch async errors that were recorded back when async writeback
2336 * failed for pages in this mapping. */
/* Read-and-clear: each error is reported to at most one closer. */
2337 rc = lli->lli_async_rc;
2338 lli->lli_async_rc = 0;
2339 err = lov_read_and_clear_async_rc(lli->lli_clob);
2343 /* The application has been told write failure already.
2344 * Do not report failure again. */
2345 if (fd->fd_write_failed)
2347 return rc ? -EIO : 0;
2351 * Called to make sure a portion of file has been written out.
2352 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2354 * Return how many pages have been written.
/*
 * Run a CIT_FSYNC cl_io over [start, end] with the given fsync @mode
 * (NONE/LOCAL/DISCARD/ALL).  On success returns fio->fi_nr_written,
 * the number of pages written out.
 * NOTE(review): fio->fi_end assignment and the success test around
 * result are elided from this view; only comments were added here.
 */
2356 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2357 enum cl_fsync_mode mode, int ignore_layout)
2359 struct cl_env_nest nest;
2362 struct obd_capa *capa = NULL;
2363 struct cl_fsync_io *fio;
2367 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2368 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
/* Nested env: this may be called from inside another cl_io context. */
2371 env = cl_env_nested_get(&nest);
2373 RETURN(PTR_ERR(env));
2375 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2377 io = ccc_env_thread_io(env);
2378 io->ci_obj = cl_i2info(inode)->lli_clob;
2379 io->ci_ignore_layout = ignore_layout;
2381 /* initialize parameters for sync */
2382 fio = &io->u.ci_fsync;
2383 fio->fi_capa = capa;
2384 fio->fi_start = start;
2386 fio->fi_fid = ll_inode2fid(inode);
2387 fio->fi_mode = mode;
2388 fio->fi_nr_written = 0;
2390 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2391 result = cl_io_loop(env, io);
2393 result = io->ci_result;
2395 result = fio->fi_nr_written;
2396 cl_io_fini(env, io);
2397 cl_env_nested_put(&nest, env);
2405 * When dentry is provided (the 'else' case), *file->f_dentry may be
2406 * null and dentry must be used directly rather than pulled from
2407 * *file->f_dentry as is done otherwise.
/*
 * fsync handler supporting three historic kernel signatures (selected
 * by HAVE_FILE_FSYNC_4ARGS / _2ARGS).  Sequence: wait for in-flight
 * page I/O, collect recorded async write errors, md_sync() the MDT,
 * and for regular files run a CL_FSYNC_ALL over the whole object,
 * updating fd_write_failed so ll_flush() does not double-report.
 * NOTE(review): some error aggregation and RETURN lines are elided
 * from this view; only comments were added here.
 */
2410 #ifdef HAVE_FILE_FSYNC_4ARGS
2411 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2413 struct dentry *dentry = file->f_dentry;
2414 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2415 int ll_fsync(struct file *file, int datasync)
2417 struct dentry *dentry = file->f_dentry;
2419 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2422 struct inode *inode = dentry->d_inode;
2423 struct ll_inode_info *lli = ll_i2info(inode);
2424 struct ptlrpc_request *req;
2425 struct obd_capa *oc;
2429 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2430 inode->i_generation, inode);
2431 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2433 #ifdef HAVE_FILE_FSYNC_4ARGS
/* On 4-arg kernels the VFS no longer pre-flushes; do it here under i_mutex. */
2434 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2435 mutex_lock(&inode->i_mutex);
2437 /* fsync's caller has already called _fdata{sync,write}, we want
2438 * that IO to finish before calling the osc and mdc sync methods */
2439 rc = filemap_fdatawait(inode->i_mapping);
2442 /* catch async errors that were recorded back when async writeback
2443 * failed for pages in this mapping. */
2444 if (!S_ISDIR(inode->i_mode)) {
2445 err = lli->lli_async_rc;
2446 lli->lli_async_rc = 0;
2449 err = lov_read_and_clear_async_rc(lli->lli_clob);
2454 oc = ll_mdscapa_get(inode);
2455 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2461 ptlrpc_req_finished(req);
2463 if (datasync && S_ISREG(inode->i_mode)) {
2464 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2466 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2468 if (rc == 0 && err < 0)
/* Remember the outcome so close()/flush() won't re-report an error. */
2471 fd->fd_write_failed = true;
2473 fd->fd_write_failed = false;
2476 #ifdef HAVE_FILE_FSYNC_4ARGS
2477 mutex_unlock(&inode->i_mutex);
/*
 * flock/fcntl lock handler: translate a VFS file_lock (FL_FLOCK or
 * FL_POSIX) into an LDLM_FLOCK enqueue on the MDT, then mirror the
 * result into the local VFS lock tables (flock_lock_file_wait /
 * posix_lock_file_wait).  If the local step fails, the server lock is
 * rolled back with an LCK_NL (unlock) enqueue.
 * NOTE(review): the F_GETLK/F_SETLK command switch and several case
 * labels are elided from this view; only comments were added here.
 */
2482 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2484 struct inode *inode = file->f_dentry->d_inode;
2485 struct ll_sb_info *sbi = ll_i2sbi(inode);
2486 struct ldlm_enqueue_info einfo = {
2487 .ei_type = LDLM_FLOCK,
2488 .ei_cb_cp = ldlm_flock_completion_ast,
2489 .ei_cbdata = file_lock,
2491 struct md_op_data *op_data;
2492 struct lustre_handle lockh = {0};
2493 ldlm_policy_data_t flock = {{0}};
2499 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2500 inode->i_ino, file_lock);
2502 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2504 if (file_lock->fl_flags & FL_FLOCK) {
2505 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2506 /* flocks are whole-file locks */
2507 flock.l_flock.end = OFFSET_MAX;
2508 /* For flocks owner is determined by the local file desctiptor*/
2509 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2510 } else if (file_lock->fl_flags & FL_POSIX) {
2511 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2512 flock.l_flock.start = file_lock->fl_start;
2513 flock.l_flock.end = file_lock->fl_end;
2517 flock.l_flock.pid = file_lock->fl_pid;
2519 /* Somewhat ugly workaround for svc lockd.
2520 * lockd installs custom fl_lmops->lm_compare_owner that checks
2521 * for the fl_owner to be the same (which it always is on local node
2522 * I guess between lockd processes) and then compares pid.
2523 * As such we assign pid to the owner field to make it all work,
2524 * conflict with normal locks is unlikely since pid space and
2525 * pointer space for current->files are not intersecting */
2526 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2527 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map POSIX lock types onto LDLM modes: read->PR, write->PW, unlock->NL. */
2529 switch (file_lock->fl_type) {
2531 einfo.ei_mode = LCK_PR;
2534 /* An unlock request may or may not have any relation to
2535 * existing locks so we may not be able to pass a lock handle
2536 * via a normal ldlm_lock_cancel() request. The request may even
2537 * unlock a byte range in the middle of an existing lock. In
2538 * order to process an unlock request we need all of the same
2539 * information that is given with a normal read or write record
2540 * lock request. To avoid creating another ldlm unlock (cancel)
2541 * message we'll treat a LCK_NL flock request as an unlock. */
2542 einfo.ei_mode = LCK_NL;
2545 einfo.ei_mode = LCK_PW;
2548 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2549 file_lock->fl_type);
2564 flags = LDLM_FL_BLOCK_NOWAIT;
2570 flags = LDLM_FL_TEST_LOCK;
2571 /* Save the old mode so that if the mode in the lock changes we
2572 * can decrement the appropriate reader or writer refcount. */
2573 file_lock->fl_type = einfo.ei_mode;
2576 CERROR("unknown fcntl lock command: %d\n", cmd);
2580 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2581 LUSTRE_OPC_ANY, NULL);
2582 if (IS_ERR(op_data))
2583 RETURN(PTR_ERR(op_data));
2585 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2586 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2587 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2589 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2590 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* Mirror the server result into the local VFS lock bookkeeping. */
2592 if ((file_lock->fl_flags & FL_FLOCK) &&
2593 (rc == 0 || file_lock->fl_type == F_UNLCK))
2594 rc2 = flock_lock_file_wait(file, file_lock);
2595 if ((file_lock->fl_flags & FL_POSIX) &&
2596 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2597 !(flags & LDLM_FL_TEST_LOCK))
2598 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server-side lock with an unlock. */
2600 if (rc2 && file_lock->fl_type != F_UNLCK) {
2601 einfo.ei_mode = LCK_NL;
2602 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2603 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2607 ll_finish_md_op_data(op_data);
2612 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2620 * test if some locks matching bits and l_req_mode are acquired
2621 * - bits can be in different locks
2622 * - if found clear the common lock bits in *bits
2623 * - the bits not found, are kept in *bits
2625 * \param bits [IN] searched lock bits [IN]
2626 * \param l_req_mode [IN] searched lock mode
2627 * \retval boolean, true iff all bits are found
/*
 * Probe (LDLM_FL_TEST_LOCK, no references taken) each inodebit in *bits
 * individually against the MD lock namespace; matched bits — plus any
 * extra bits carried by the matched lock — are cleared from *bits.
 */
2629 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2631 struct lustre_handle lockh;
2632 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": search all four read/write modes. */
2633 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2634 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2643 fid = &ll_i2info(inode)->lli_fid;
2644 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2645 ldlm_lockname[mode]);
2647 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2648 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2649 policy.l_inodebits.bits = *bits & (1 << i);
2650 if (policy.l_inodebits.bits == 0)
2653 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2654 &policy, mode, &lockh)) {
2655 struct ldlm_lock *lock;
2657 lock = ldlm_handle2lock(&lockh);
/* One matched lock may cover several requested bits; clear them all. */
2660 ~(lock->l_policy_data.l_inodebits.bits);
2661 LDLM_LOCK_PUT(lock);
2663 *bits &= ~policy.l_inodebits.bits;
2670 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2671 struct lustre_handle *lockh, __u64 flags)
2673 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2678 fid = &ll_i2info(inode)->lli_fid;
2679 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2681 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2682 fid, LDLM_IBITS, &policy,
2683 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2687 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2689 /* Already unlinked. Just update nlink and return success */
2690 if (rc == -ENOENT) {
2692 /* This path cannot be hit for regular files unless in
2693 * case of obscure races, so no need to to validate
2695 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2697 } else if (rc != 0) {
2698 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2699 ll_get_fsname(inode->i_sb, NULL, 0),
2700 PFID(ll_inode2fid(inode)), rc);
2706 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2709 struct inode *inode = dentry->d_inode;
2710 struct ptlrpc_request *req = NULL;
2711 struct obd_export *exp;
2715 LASSERT(inode != NULL);
2717 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2718 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2720 exp = ll_i2mdexp(inode);
2722 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2723 * But under CMD case, it caused some lock issues, should be fixed
2724 * with new CMD ibits lock. See bug 12718 */
2725 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2726 struct lookup_intent oit = { .it_op = IT_GETATTR };
2727 struct md_op_data *op_data;
2729 if (ibits == MDS_INODELOCK_LOOKUP)
2730 oit.it_op = IT_LOOKUP;
2732 /* Call getattr by fid, so do not provide name at all. */
2733 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2734 dentry->d_inode, NULL, 0, 0,
2735 LUSTRE_OPC_ANY, NULL);
2736 if (IS_ERR(op_data))
2737 RETURN(PTR_ERR(op_data));
2739 oit.it_create_mode |= M_CHECK_STALE;
2740 rc = md_intent_lock(exp, op_data, NULL, 0,
2741 /* we are not interested in name
2744 ll_md_blocking_ast, 0);
2745 ll_finish_md_op_data(op_data);
2746 oit.it_create_mode &= ~M_CHECK_STALE;
2748 rc = ll_inode_revalidate_fini(inode, rc);
2752 rc = ll_revalidate_it_finish(req, &oit, dentry);
2754 ll_intent_release(&oit);
2758 /* Unlinked? Unhash dentry, so it is not picked up later by
2759 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2760 here to preserve get_cwd functionality on 2.6.
2762 if (!dentry->d_inode->i_nlink)
2763 d_lustre_invalidate(dentry, 0);
2765 ll_lookup_finish_locks(&oit, dentry);
2766 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2767 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2768 obd_valid valid = OBD_MD_FLGETATTR;
2769 struct md_op_data *op_data;
2772 if (S_ISREG(inode->i_mode)) {
2773 rc = ll_get_max_mdsize(sbi, &ealen);
2776 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2779 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2780 0, ealen, LUSTRE_OPC_ANY,
2782 if (IS_ERR(op_data))
2783 RETURN(PTR_ERR(op_data));
2785 op_data->op_valid = valid;
2786 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2787 * capa for this inode. Because we only keep capas of dirs
2789 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2790 ll_finish_md_op_data(op_data);
2792 rc = ll_inode_revalidate_fini(inode, rc);
2796 rc = ll_prep_inode(&inode, req, NULL, NULL);
2799 ptlrpc_req_finished(req);
2803 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2806 struct inode *inode = dentry->d_inode;
2810 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2814 /* if object isn't regular file, don't validate size */
2815 if (!S_ISREG(inode->i_mode)) {
2816 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2817 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2818 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2820 /* In case of restore, the MDT has the right size and has
2821 * already send it back without granting the layout lock,
2822 * inode is up-to-date so glimpse is useless.
2823 * Also to glimpse we need the layout, in case of a running
2824 * restore the MDT holds the layout lock so the glimpse will
2825 * block up to the end of restore (getattr will block)
2827 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2828 rc = ll_glimpse_size(inode);
2833 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2834 struct lookup_intent *it, struct kstat *stat)
2836 struct inode *inode = de->d_inode;
2837 struct ll_sb_info *sbi = ll_i2sbi(inode);
2838 struct ll_inode_info *lli = ll_i2info(inode);
2841 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2842 MDS_INODELOCK_LOOKUP);
2843 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2848 stat->dev = inode->i_sb->s_dev;
2849 if (ll_need_32bit_api(sbi))
2850 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2852 stat->ino = inode->i_ino;
2853 stat->mode = inode->i_mode;
2854 stat->nlink = inode->i_nlink;
2855 stat->uid = inode->i_uid;
2856 stat->gid = inode->i_gid;
2857 stat->rdev = inode->i_rdev;
2858 stat->atime = inode->i_atime;
2859 stat->mtime = inode->i_mtime;
2860 stat->ctime = inode->i_ctime;
2861 stat->blksize = 1 << inode->i_blkbits;
2863 stat->size = i_size_read(inode);
2864 stat->blocks = inode->i_blocks;
2868 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2870 struct lookup_intent it = { .it_op = IT_GETATTR };
2872 return ll_getattr_it(mnt, de, &it, stat);
2875 #ifdef HAVE_LINUX_FIEMAP_H
2876 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2877 __u64 start, __u64 len)
2881 struct ll_user_fiemap *fiemap;
2882 unsigned int extent_count = fieinfo->fi_extents_max;
2884 num_bytes = sizeof(*fiemap) + (extent_count *
2885 sizeof(struct ll_fiemap_extent));
2886 OBD_ALLOC_LARGE(fiemap, num_bytes);
2891 fiemap->fm_flags = fieinfo->fi_flags;
2892 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2893 fiemap->fm_start = start;
2894 fiemap->fm_length = len;
2895 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2896 sizeof(struct ll_fiemap_extent));
2898 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2900 fieinfo->fi_flags = fiemap->fm_flags;
2901 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2902 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2903 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2905 OBD_FREE_LARGE(fiemap, num_bytes);
2910 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2912 struct ll_inode_info *lli = ll_i2info(inode);
2913 struct posix_acl *acl = NULL;
2916 spin_lock(&lli->lli_lock);
2917 /* VFS' acl_permission_check->check_acl will release the refcount */
2918 acl = posix_acl_dup(lli->lli_posix_acl);
2919 spin_unlock(&lli->lli_lock);
2924 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2926 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2927 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2929 ll_check_acl(struct inode *inode, int mask)
2932 # ifdef CONFIG_FS_POSIX_ACL
2933 struct posix_acl *acl;
2937 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2938 if (flags & IPERM_FLAG_RCU)
2941 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2946 rc = posix_acl_permission(inode, acl, mask);
2947 posix_acl_release(acl);
2950 # else /* !CONFIG_FS_POSIX_ACL */
2952 # endif /* CONFIG_FS_POSIX_ACL */
2954 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2956 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2957 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2959 # ifdef HAVE_INODE_PERMISION_2ARGS
2960 int ll_inode_permission(struct inode *inode, int mask)
2962 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2969 #ifdef MAY_NOT_BLOCK
2970 if (mask & MAY_NOT_BLOCK)
2972 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2973 if (flags & IPERM_FLAG_RCU)
2977 /* as root inode are NOT getting validated in lookup operation,
2978 * need to do it before permission check. */
2980 if (inode == inode->i_sb->s_root->d_inode) {
2981 struct lookup_intent it = { .it_op = IT_LOOKUP };
2983 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2984 MDS_INODELOCK_LOOKUP);
2989 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2990 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2992 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2993 return lustre_check_remote_perm(inode, mask);
2995 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2996 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3001 #ifdef HAVE_FILE_READV
3002 #define READ_METHOD readv
3003 #define READ_FUNCTION ll_file_readv
3004 #define WRITE_METHOD writev
3005 #define WRITE_FUNCTION ll_file_writev
3007 #define READ_METHOD aio_read
3008 #define READ_FUNCTION ll_file_aio_read
3009 #define WRITE_METHOD aio_write
3010 #define WRITE_FUNCTION ll_file_aio_write
3013 /* -o localflock - only provides locally consistent flock locks */
3014 struct file_operations ll_file_operations = {
3015 .read = ll_file_read,
3016 .READ_METHOD = READ_FUNCTION,
3017 .write = ll_file_write,
3018 .WRITE_METHOD = WRITE_FUNCTION,
3019 .unlocked_ioctl = ll_file_ioctl,
3020 .open = ll_file_open,
3021 .release = ll_file_release,
3022 .mmap = ll_file_mmap,
3023 .llseek = ll_file_seek,
3024 .splice_read = ll_file_splice_read,
3029 struct file_operations ll_file_operations_flock = {
3030 .read = ll_file_read,
3031 .READ_METHOD = READ_FUNCTION,
3032 .write = ll_file_write,
3033 .WRITE_METHOD = WRITE_FUNCTION,
3034 .unlocked_ioctl = ll_file_ioctl,
3035 .open = ll_file_open,
3036 .release = ll_file_release,
3037 .mmap = ll_file_mmap,
3038 .llseek = ll_file_seek,
3039 .splice_read = ll_file_splice_read,
3042 .flock = ll_file_flock,
3043 .lock = ll_file_flock
3046 /* These are for -o noflock - to return ENOSYS on flock calls */
3047 struct file_operations ll_file_operations_noflock = {
3048 .read = ll_file_read,
3049 .READ_METHOD = READ_FUNCTION,
3050 .write = ll_file_write,
3051 .WRITE_METHOD = WRITE_FUNCTION,
3052 .unlocked_ioctl = ll_file_ioctl,
3053 .open = ll_file_open,
3054 .release = ll_file_release,
3055 .mmap = ll_file_mmap,
3056 .llseek = ll_file_seek,
3057 .splice_read = ll_file_splice_read,
3060 .flock = ll_file_noflock,
3061 .lock = ll_file_noflock
3064 struct inode_operations ll_file_inode_operations = {
3065 .setattr = ll_setattr,
3066 .getattr = ll_getattr,
3067 .permission = ll_inode_permission,
3068 .setxattr = ll_setxattr,
3069 .getxattr = ll_getxattr,
3070 .listxattr = ll_listxattr,
3071 .removexattr = ll_removexattr,
3072 #ifdef HAVE_LINUX_FIEMAP_H
3073 .fiemap = ll_fiemap,
3075 #ifdef HAVE_IOP_GET_ACL
3076 .get_acl = ll_get_acl,
3080 /* dynamic ioctl number support routins */
3081 static struct llioc_ctl_data {
3082 struct rw_semaphore ioc_sem;
3083 cfs_list_t ioc_head;
3085 __RWSEM_INITIALIZER(llioc.ioc_sem),
3086 CFS_LIST_HEAD_INIT(llioc.ioc_head)
3091 cfs_list_t iocd_list;
3092 unsigned int iocd_size;
3093 llioc_callback_t iocd_cb;
3094 unsigned int iocd_count;
3095 unsigned int iocd_cmd[0];
3098 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3101 struct llioc_data *in_data = NULL;
3104 if (cb == NULL || cmd == NULL ||
3105 count > LLIOC_MAX_CMD || count < 0)
3108 size = sizeof(*in_data) + count * sizeof(unsigned int);
3109 OBD_ALLOC(in_data, size);
3110 if (in_data == NULL)
3113 memset(in_data, 0, sizeof(*in_data));
3114 in_data->iocd_size = size;
3115 in_data->iocd_cb = cb;
3116 in_data->iocd_count = count;
3117 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3119 down_write(&llioc.ioc_sem);
3120 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3121 up_write(&llioc.ioc_sem);
3126 void ll_iocontrol_unregister(void *magic)
3128 struct llioc_data *tmp;
3133 down_write(&llioc.ioc_sem);
3134 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3136 unsigned int size = tmp->iocd_size;
3138 cfs_list_del(&tmp->iocd_list);
3139 up_write(&llioc.ioc_sem);
3141 OBD_FREE(tmp, size);
3145 up_write(&llioc.ioc_sem);
3147 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3150 EXPORT_SYMBOL(ll_iocontrol_register);
3151 EXPORT_SYMBOL(ll_iocontrol_unregister);
3153 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3154 unsigned int cmd, unsigned long arg, int *rcp)
3156 enum llioc_iter ret = LLIOC_CONT;
3157 struct llioc_data *data;
3158 int rc = -EINVAL, i;
3160 down_read(&llioc.ioc_sem);
3161 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3162 for (i = 0; i < data->iocd_count; i++) {
3163 if (cmd != data->iocd_cmd[i])
3166 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3170 if (ret == LLIOC_STOP)
3173 up_read(&llioc.ioc_sem);
3180 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3182 struct ll_inode_info *lli = ll_i2info(inode);
3183 struct cl_env_nest nest;
3188 if (lli->lli_clob == NULL)
3191 env = cl_env_nested_get(&nest);
3193 RETURN(PTR_ERR(env));
3195 result = cl_conf_set(env, lli->lli_clob, conf);
3196 cl_env_nested_put(&nest, env);
3198 if (conf->coc_opc == OBJECT_CONF_SET) {
3199 struct ldlm_lock *lock = conf->coc_lock;
3201 LASSERT(lock != NULL);
3202 LASSERT(ldlm_has_layout(lock));
3204 /* it can only be allowed to match after layout is
3205 * applied to inode otherwise false layout would be
3206 * seen. Applying layout shoud happen before dropping
3207 * the intent lock. */
3208 ldlm_lock_allow_match(lock);
3214 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3215 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3218 struct ll_sb_info *sbi = ll_i2sbi(inode);
3219 struct obd_capa *oc;
3220 struct ptlrpc_request *req;
3221 struct mdt_body *body;
3228 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3229 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3230 lock->l_lvb_data, lock->l_lvb_len);
3232 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3235 /* if layout lock was granted right away, the layout is returned
3236 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3237 * blocked and then granted via completion ast, we have to fetch
3238 * layout here. Please note that we can't use the LVB buffer in
3239 * completion AST because it doesn't have a large enough buffer */
3240 oc = ll_mdscapa_get(inode);
3241 rc = ll_get_max_mdsize(sbi, &lmmsize);
3243 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3244 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3250 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3251 if (body == NULL || body->eadatasize > lmmsize)
3252 GOTO(out, rc = -EPROTO);
3254 lmmsize = body->eadatasize;
3255 if (lmmsize == 0) /* empty layout */
3258 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3260 GOTO(out, rc = -EFAULT);
3262 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3263 if (lvbdata == NULL)
3264 GOTO(out, rc = -ENOMEM);
3266 memcpy(lvbdata, lmm, lmmsize);
3267 lock_res_and_lock(lock);
3268 if (lock->l_lvb_data != NULL)
3269 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3271 lock->l_lvb_data = lvbdata;
3272 lock->l_lvb_len = lmmsize;
3273 unlock_res_and_lock(lock);
3278 ptlrpc_req_finished(req);
3283 * Apply the layout to the inode. Layout lock is held and will be released
3286 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3287 struct inode *inode, __u32 *gen, bool reconf)
3289 struct ll_inode_info *lli = ll_i2info(inode);
3290 struct ll_sb_info *sbi = ll_i2sbi(inode);
3291 struct ldlm_lock *lock;
3292 struct lustre_md md = { NULL };
3293 struct cl_object_conf conf;
3296 bool wait_layout = false;
3299 LASSERT(lustre_handle_is_used(lockh));
3301 lock = ldlm_handle2lock(lockh);
3302 LASSERT(lock != NULL);
3303 LASSERT(ldlm_has_layout(lock));
3305 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3306 inode, PFID(&lli->lli_fid), reconf);
3308 /* in case this is a caching lock and reinstate with new inode */
3309 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3311 lock_res_and_lock(lock);
3312 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3313 unlock_res_and_lock(lock);
3314 /* checking lvb_ready is racy but this is okay. The worst case is
3315 * that multi processes may configure the file on the same time. */
3317 if (lvb_ready || !reconf) {
3320 /* layout_gen must be valid if layout lock is not
3321 * cancelled and stripe has already set */
3322 *gen = lli->lli_layout_gen;
3328 rc = ll_layout_fetch(inode, lock);
3332 /* for layout lock, lmm is returned in lock's lvb.
3333 * lvb_data is immutable if the lock is held so it's safe to access it
3334 * without res lock. See the description in ldlm_lock_decref_internal()
3335 * for the condition to free lvb_data of layout lock */
3336 if (lock->l_lvb_data != NULL) {
3337 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3338 lock->l_lvb_data, lock->l_lvb_len);
3340 *gen = LL_LAYOUT_GEN_EMPTY;
3342 *gen = md.lsm->lsm_layout_gen;
3345 CERROR("%s: file "DFID" unpackmd error: %d\n",
3346 ll_get_fsname(inode->i_sb, NULL, 0),
3347 PFID(&lli->lli_fid), rc);
3353 /* set layout to file. Unlikely this will fail as old layout was
3354 * surely eliminated */
3355 memset(&conf, 0, sizeof conf);
3356 conf.coc_opc = OBJECT_CONF_SET;
3357 conf.coc_inode = inode;
3358 conf.coc_lock = lock;
3359 conf.u.coc_md = &md;
3360 rc = ll_layout_conf(inode, &conf);
3363 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3365 /* refresh layout failed, need to wait */
3366 wait_layout = rc == -EBUSY;
3370 LDLM_LOCK_PUT(lock);
3371 ldlm_lock_decref(lockh, mode);
3373 /* wait for IO to complete if it's still being used. */
3375 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3376 ll_get_fsname(inode->i_sb, NULL, 0),
3377 inode, PFID(&lli->lli_fid));
3379 memset(&conf, 0, sizeof conf);
3380 conf.coc_opc = OBJECT_CONF_WAIT;
3381 conf.coc_inode = inode;
3382 rc = ll_layout_conf(inode, &conf);
3386 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3387 PFID(&lli->lli_fid), rc);
3393 * This function checks if there exists a LAYOUT lock on the client side,
3394 * or enqueues it if it doesn't have one in cache.
3396 * This function will not hold layout lock so it may be revoked any time after
3397 * this function returns. Any operations depend on layout should be redone
3400 * This function should be called before lov_io_init() to get an uptodate
3401 * layout version, the caller should save the version number and after IO
3402 * is finished, this function should be called again to verify that layout
3403 * is not changed during IO time.
3405 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3407 struct ll_inode_info *lli = ll_i2info(inode);
3408 struct ll_sb_info *sbi = ll_i2sbi(inode);
3409 struct md_op_data *op_data;
3410 struct lookup_intent it;
3411 struct lustre_handle lockh;
3413 struct ldlm_enqueue_info einfo = {
3414 .ei_type = LDLM_IBITS,
3416 .ei_cb_bl = ll_md_blocking_ast,
3417 .ei_cb_cp = ldlm_completion_ast,
3422 *gen = lli->lli_layout_gen;
3423 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3427 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3428 LASSERT(S_ISREG(inode->i_mode));
3430 /* mostly layout lock is caching on the local side, so try to match
3431 * it before grabbing layout lock mutex. */
3432 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3433 if (mode != 0) { /* hit cached lock */
3434 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3438 /* better hold lli_layout_mutex to try again otherwise
3439 * it will have starvation problem. */
3442 /* take layout lock mutex to enqueue layout lock exclusively. */
3443 mutex_lock(&lli->lli_layout_mutex);
3446 /* try again. Maybe somebody else has done this. */
3447 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3448 if (mode != 0) { /* hit cached lock */
3449 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3453 mutex_unlock(&lli->lli_layout_mutex);
3457 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3458 0, 0, LUSTRE_OPC_ANY, NULL);
3459 if (IS_ERR(op_data)) {
3460 mutex_unlock(&lli->lli_layout_mutex);
3461 RETURN(PTR_ERR(op_data));
3464 /* have to enqueue one */
3465 memset(&it, 0, sizeof(it));
3466 it.it_op = IT_LAYOUT;
3467 lockh.cookie = 0ULL;
3469 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3470 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3471 PFID(&lli->lli_fid));
3473 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3475 if (it.d.lustre.it_data != NULL)
3476 ptlrpc_req_finished(it.d.lustre.it_data);
3477 it.d.lustre.it_data = NULL;
3479 ll_finish_md_op_data(op_data);
3481 mode = it.d.lustre.it_lock_mode;
3482 it.d.lustre.it_lock_mode = 0;
3483 ll_intent_drop_lock(&it);
3486 /* set lock data in case this is a new lock */
3487 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3488 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3492 mutex_unlock(&lli->lli_layout_mutex);
3498 * This function send a restore request to the MDT
3500 int ll_layout_restore(struct inode *inode)
3502 struct hsm_user_request *hur;
3506 len = sizeof(struct hsm_user_request) +
3507 sizeof(struct hsm_user_item);
3508 OBD_ALLOC(hur, len);
3512 hur->hur_request.hr_action = HUA_RESTORE;
3513 hur->hur_request.hr_archive_id = 0;
3514 hur->hur_request.hr_flags = 0;
3515 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3516 sizeof(hur->hur_user_item[0].hui_fid));
3517 hur->hur_user_item[0].hui_extent.length = -1;
3518 hur->hur_request.hr_itemcount = 1;
3519 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,