4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open ll_file_data from the slab cache.
 * The fd_write_failed flag starts clear; it is set later if an async
 * write fails so that close can report the error. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
	/* No write error recorded yet for this open. */
58 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache. */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (mode, times, size, blocks,
 * flags), its IO epoch, the MDS capability and the open handle @fh
 * into @op_data for transmission to the MDS. */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
	/* Translate VFS inode flags to the on-wire ext-style flag bits. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
	/* Tell the MDS the data was modified so it can update SOM state. */
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
90 * Closes the IO epoch and packs all the attributes into @op_data for
93 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
94 struct obd_client_handle *och)
	/* Always send mode and timestamps on close. */
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
99 ATTR_MTIME_SET | ATTR_CTIME_SET;
101 if (!(och->och_flags & FMODE_WRITE))
	/* Without Size-on-MDS support (or for non-regular files) the MDS
	 * needs size/blocks from the client on close. */
104 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
105 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107 ll_ioepoch_close(inode, op_data, &och, 0);
110 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
111 ll_prep_md_op_data(op_data, inode, NULL, NULL,
112 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send the MDS close RPC for an open handle @och on @inode, perform a
 * Size-on-MDS update if the server requests one, destroy any orphan
 * objects named in the close reply, and invalidate the handle. */
116 static int ll_close_inode_openhandle(struct obd_export *md_exp,
118 struct obd_client_handle *och)
120 struct obd_export *exp = ll_i2mdexp(inode);
121 struct md_op_data *op_data;
122 struct ptlrpc_request *req = NULL;
123 struct obd_device *obd = class_exp2obd(exp);
130 * XXX: in case of LMV, is this correct to access
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
138 OBD_ALLOC_PTR(op_data);
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
142 ll_prepare_close(inode, op_data, och);
143 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
144 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(epoch_close);
148 /* MDS has instructed us to obtain Size-on-MDS attribute from
149 * OSTs and send setattr back to MDS. */
150 rc = ll_som_update(inode, op_data);
152 CERROR("inode %lu mdc Size-on-MDS update failed: "
153 "rc = %d\n", inode->i_ino, rc);
157 CERROR("inode %lu mdc close failed: rc = %d\n",
161 /* DATA_MODIFIED flag was successfully sent on close, cancel data
162 * modification flag. */
163 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
164 struct ll_inode_info *lli = ll_i2info(inode);
	/* lli_flags is shared state; clear the bit under lli_lock. */
166 spin_lock(&lli->lli_lock);
167 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
168 spin_unlock(&lli->lli_lock);
171 ll_finish_md_op_data(op_data);
	/* Destroy OST objects the close reply marked as orphans. */
174 rc = ll_objects_destroy(req, inode);
176 CERROR("inode %lu ll_objects destroy: rc = %d\n",
	/* If SOM is in use and the epoch is still open on a write handle,
	 * queue DONE_WRITING so the epoch gets closed later. */
183 if (exp_connect_som(exp) && !epoch_close &&
184 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
185 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
187 md_clear_open_replay_data(md_exp, och);
188 /* Free @och if it is not waiting for DONE_WRITING. */
189 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
192 if (req) /* This is close request */
193 ptlrpc_req_finished(req);
/* Drop the cached MDS open handle of the kind selected by @flags
 * (write/exec/read) if no other local opens still use it, and send
 * the close RPC for it. */
197 int ll_md_real_close(struct inode *inode, int flags)
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
	/* Pick the handle slot and use-counter matching the open mode. */
206 if (flags & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (flags & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
213 LASSERT(flags & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount) { /* There are still users of this handle, so
221 mutex_unlock(&lli->lli_och_mutex);
226 mutex_unlock(&lli->lli_och_mutex);
	/* och may already have been freed by a racing closer. */
228 if (och) { /* There might be a race and somebody have freed this och
230 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: release any group lock, drop this fd's
 * reference on the per-mode open count and, when an MDS OPEN lock is
 * not cached, do the real MDS close.  Frees the ll_file_data. */
237 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
240 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
241 struct ll_inode_info *lli = ll_i2info(inode);
245 /* clear group lock, if present */
246 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
247 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
249 /* Let's see if we have good enough OPEN lock on the file and if
250 we can skip talking to MDS */
251 if (file->f_dentry->d_inode) { /* Can this ever be false? */
	/* TEST_LOCK: only check for a matching granted lock, do not take it. */
253 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254 struct lustre_handle lockh;
255 struct inode *inode = file->f_dentry->d_inode;
256 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
	/* Drop this fd's contribution to the per-mode open count. */
258 mutex_lock(&lli->lli_och_mutex);
259 if (fd->fd_omode & FMODE_WRITE) {
261 LASSERT(lli->lli_open_fd_write_count);
262 lli->lli_open_fd_write_count--;
263 } else if (fd->fd_omode & FMODE_EXEC) {
265 LASSERT(lli->lli_open_fd_exec_count);
266 lli->lli_open_fd_exec_count--;
269 LASSERT(lli->lli_open_fd_read_count);
270 lli->lli_open_fd_read_count--;
272 mutex_unlock(&lli->lli_och_mutex);
	/* No cached OPEN lock -> must tell the MDS about the close now. */
274 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275 LDLM_IBITS, &policy, lockmode,
277 rc = ll_md_real_close(file->f_dentry->d_inode,
281 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282 file, file->f_dentry, file->f_dentry->d_name.name);
285 LUSTRE_FPRIVATE(file) = NULL;
286 ll_file_data_put(fd);
287 ll_capa_close(inode);
292 /* While this returns an error code, fput() the caller does not, so we need
293 * to make every effort to clean up all of our state here. Also, applications
294 * rarely check close errors and even if an error is returned they will not
295 * re-try the close call.
297 int ll_file_release(struct inode *inode, struct file *file)
299 struct ll_file_data *fd;
300 struct ll_sb_info *sbi = ll_i2sbi(inode);
301 struct ll_inode_info *lli = ll_i2info(inode);
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
	/* Remote-client ACL bookkeeping only applies to the root inode. */
308 #ifdef CONFIG_FS_POSIX_ACL
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
	/* Don't count releases of the root dentry in the stats. */
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
331 lli->lli_opendir_pid != 0)
332 ll_stop_statahead(inode, lli->lli_opendir_key);
	/* The root is never really opened on the MDS; just drop fd. */
334 if (inode->i_sb->s_root == file->f_dentry) {
335 LUSTRE_FPRIVATE(file) = NULL;
336 ll_file_data_put(fd);
	/* Collect any async write error recorded against the clio object. */
340 if (!S_ISDIR(inode->i_mode)) {
341 lov_read_and_clear_async_rc(lli->lli_clob);
342 lli->lli_async_rc = 0;
345 rc = ll_md_close(sbi->ll_md_exp, inode, file);
347 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
348 libcfs_debug_dumplog();
/* Enqueue an OPEN intent lock on the MDS for @file, optionally sending
 * striping info (@lmm/@lmmsize) when setting stripes.  On success the
 * dentry's inode is (re)filled from the reply and the lock data set. */
353 static int ll_intent_file_open(struct file *file, void *lmm,
354 int lmmsize, struct lookup_intent *itp)
356 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
357 struct dentry *parent = file->f_dentry->d_parent;
358 const char *name = file->f_dentry->d_name.name;
359 const int len = file->f_dentry->d_name.len;
360 struct md_op_data *op_data;
361 struct ptlrpc_request *req;
362 __u32 opc = LUSTRE_OPC_ANY;
369 /* Usually we come here only for NFSD, and we want open lock.
370 But we can also get here with pre 2.6.15 patchless kernels, and in
371 that case that lock is also ok */
372 /* We can also get here if there was cached open handle in revalidate_it
373 * but it disappeared while we were getting from there to ll_file_open.
374 * But this means this file was closed and immediately opened which
375 * makes a good candidate for using OPEN lock */
376 /* If lmmsize & lmm are not 0, we are just setting stripe info
377 * parameters. No need for the open lock */
378 if (lmm == NULL && lmmsize == 0) {
379 itp->it_flags |= MDS_OPEN_LOCK;
380 if (itp->it_flags & FMODE_WRITE)
381 opc = LUSTRE_OPC_CREATE;
384 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
385 file->f_dentry->d_inode, name, len,
388 RETURN(PTR_ERR(op_data));
	/* The inode already exists; open it by FID rather than by name. */
390 itp->it_flags |= MDS_OPEN_BY_FID;
391 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
392 0 /*unused */, &req, ll_md_blocking_ast, 0);
393 ll_finish_md_op_data(op_data);
395 /* reason for keep own exit path - don't flood log
396 * with messages with -ESTALE errors.
398 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
399 it_open_error(DISP_OPEN_OPEN, itp))
401 ll_release_openhandle(file->f_dentry, itp);
405 if (it_disposition(itp, DISP_LOOKUP_NEG))
406 GOTO(out, rc = -ENOENT);
408 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
409 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
410 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
	/* Refresh the inode from the intent reply and attach lock data. */
414 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
415 if (!rc && itp->d.lustre.it_lock_mode)
416 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
420 ptlrpc_req_finished(itp->d.lustre.it_data);
421 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
422 ll_intent_drop_lock(itp);
428 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
429 * not believe attributes if a few ioepoch holders exist. Attributes for
430 * previous ioepoch if new one is opened are also skipped by MDS.
432 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
	/* Only record a non-zero epoch that differs from the current one. */
434 if (ioepoch && lli->lli_ioepoch != ioepoch) {
435 lli->lli_ioepoch = ioepoch;
436 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
437 ioepoch, PFID(&lli->lli_fid));
/* Fill an obd_client_handle from the MDT reply carried by intent @it:
 * copy the server file handle, FID and open flags, record the IO epoch
 * and register the open for replay. */
441 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
442 struct lookup_intent *it, struct obd_client_handle *och)
444 struct ptlrpc_request *req = it->d.lustre.it_data;
445 struct mdt_body *body;
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 LASSERT(body != NULL); /* reply already checked out */
452 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_fid = lli->lli_fid;
455 och->och_flags = it->it_flags;
456 ll_ioepoch_open(lli, body->ioepoch);
	/* Keep the open request so it can be replayed after MDS recovery. */
458 return md_set_open_replay_data(md_exp, och, req);
/* Complete an open locally: fill @och from the intent reply (when a
 * new MDS handle was obtained), attach @fd as the file's private data
 * and initialize readahead state. */
461 int ll_local_open(struct file *file, struct lookup_intent *it,
462 struct ll_file_data *fd, struct obd_client_handle *och)
464 struct inode *inode = file->f_dentry->d_inode;
465 struct ll_inode_info *lli = ll_i2info(inode);
	/* f_private must not already be set; ll_file_open cleared it. */
468 LASSERT(!LUSTRE_FPRIVATE(file));
473 struct ptlrpc_request *req = it->d.lustre.it_data;
474 struct mdt_body *body;
477 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
481 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
482 if ((it->it_flags & FMODE_WRITE) &&
483 (body->valid & OBD_MD_FLSIZE))
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 lli->lli_ioepoch, PFID(&lli->lli_fid));
488 LUSTRE_FPRIVATE(file) = fd;
489 ll_readahead_init(inode, &fd->fd_ras);
	/* Remember the open mode for accounting at close time. */
490 fd->fd_omode = it->it_flags;
494 /* Open a file, and (for the very first open) create objects on the OSTs at
495 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
496 * creation or open until ll_lov_setstripe() ioctl is called.
498 * If we already have the stripe MD locally then we don't request it in
499 * md_open(), by passing a lmm_size = 0.
501 * It is up to the application to ensure no other processes open this file
502 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
503 * used. We might be able to avoid races of that sort by getting lli_open_sem
504 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
505 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
507 int ll_file_open(struct inode *inode, struct file *file)
509 struct ll_inode_info *lli = ll_i2info(inode);
510 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
511 .it_flags = file->f_flags };
512 struct obd_client_handle **och_p = NULL;
513 __u64 *och_usecount = NULL;
514 struct ll_file_data *fd;
515 int rc = 0, opendir_set = 0;
518 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
519 inode->i_generation, inode, file->f_flags);
	/* An intent stashed in private_data means lookup already did the
	 * MDS open; take it over and clear the field for ll_local_open. */
521 it = file->private_data; /* XXX: compat macro */
522 file->private_data = NULL; /* prevent ll_local_open assertion */
524 fd = ll_file_data_get();
526 GOTO(out_och_free, rc = -ENOMEM);
	/* First opener of a directory claims the statahead key. */
529 if (S_ISDIR(inode->i_mode)) {
530 spin_lock(&lli->lli_sa_lock);
531 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
532 lli->lli_opendir_pid == 0) {
533 lli->lli_opendir_key = fd;
534 lli->lli_opendir_pid = cfs_curproc_pid();
537 spin_unlock(&lli->lli_sa_lock);
	/* The filesystem root is never opened on the MDS. */
540 if (inode->i_sb->s_root == file->f_dentry) {
541 LUSTRE_FPRIVATE(file) = fd;
	/* No usable intent: build our own OPEN intent from f_flags. */
545 if (!it || !it->d.lustre.it_disposition) {
546 /* Convert f_flags into access mode. We cannot use file->f_mode,
547 * because everything but O_ACCMODE mask was stripped from
549 if ((oit.it_flags + 1) & O_ACCMODE)
551 if (file->f_flags & O_TRUNC)
552 oit.it_flags |= FMODE_WRITE;
554 /* kernel only call f_op->open in dentry_open. filp_open calls
555 * dentry_open after call to open_namei that checks permissions.
556 * Only nfsd_open call dentry_open directly without checking
557 * permissions and because of that this code below is safe. */
558 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
559 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561 /* We do not want O_EXCL here, presumably we opened the file
562 * already? XXX - NFS implications? */
563 oit.it_flags &= ~O_EXCL;
565 /* bug20584, if "it_flags" contains O_CREAT, the file will be
566 * created if necessary, then "IT_CREAT" should be set to keep
567 * consistent with it */
568 if (oit.it_flags & O_CREAT)
569 oit.it_op |= IT_CREAT;
575 /* Let's see if we have file open on MDS already. */
576 if (it->it_flags & FMODE_WRITE) {
577 och_p = &lli->lli_mds_write_och;
578 och_usecount = &lli->lli_open_fd_write_count;
579 } else if (it->it_flags & FMODE_EXEC) {
580 och_p = &lli->lli_mds_exec_och;
581 och_usecount = &lli->lli_open_fd_exec_count;
583 och_p = &lli->lli_mds_read_och;
584 och_usecount = &lli->lli_open_fd_read_count;
587 mutex_lock(&lli->lli_och_mutex);
588 if (*och_p) { /* Open handle is present */
589 if (it_disposition(it, DISP_OPEN_OPEN)) {
590 /* Well, there's extra open request that we do not need,
591 let's close it somehow. This will decref request. */
592 rc = it_open_error(DISP_OPEN_OPEN, it);
594 mutex_unlock(&lli->lli_och_mutex);
595 GOTO(out_openerr, rc);
598 ll_release_openhandle(file->f_dentry, it);
	/* Reuse the cached handle; och == NULL means "existing". */
602 rc = ll_local_open(file, it, fd, NULL);
605 mutex_unlock(&lli->lli_och_mutex);
606 GOTO(out_openerr, rc);
609 LASSERT(*och_usecount == 0);
610 if (!it->d.lustre.it_disposition) {
611 /* We cannot just request lock handle now, new ELC code
612 means that one of other OPEN locks for this file
613 could be cancelled, and since blocking ast handler
614 would attempt to grab och_mutex as well, that would
615 result in a deadlock */
616 mutex_unlock(&lli->lli_och_mutex);
617 it->it_create_mode |= M_CHECK_STALE;
618 rc = ll_intent_file_open(file, NULL, 0, it);
619 it->it_create_mode &= ~M_CHECK_STALE;
621 GOTO(out_openerr, rc);
625 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
627 GOTO(out_och_free, rc = -ENOMEM);
631 /* md_intent_lock() didn't get a request ref if there was an
632 * open error, so don't do cleanup on the request here
634 /* XXX (green): Should not we bail out on any error here, not
635 * just open error? */
636 rc = it_open_error(DISP_OPEN_OPEN, it);
638 GOTO(out_och_free, rc);
640 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
642 rc = ll_local_open(file, it, fd, *och_p);
644 GOTO(out_och_free, rc);
646 mutex_unlock(&lli->lli_och_mutex);
649 /* Must do this outside lli_och_mutex lock to prevent deadlock where
650 different kind of OPEN lock for this same inode gets cancelled
651 by ldlm_cancel_lru */
652 if (!S_ISREG(inode->i_mode))
653 GOTO(out_och_free, rc);
	/* No stripe metadata yet: either delay creation (per flags) or
	 * fall through and clear the delay flag. */
657 if (!lli->lli_has_smd) {
658 if (file->f_flags & O_LOV_DELAY_CREATE ||
659 !(file->f_mode & FMODE_WRITE)) {
660 CDEBUG(D_INODE, "object creation was delayed\n");
661 GOTO(out_och_free, rc);
664 file->f_flags &= ~O_LOV_DELAY_CREATE;
665 GOTO(out_och_free, rc);
	/* Error/common exit: free a handle we allocated but did not use. */
669 if (och_p && *och_p) {
670 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
671 *och_p = NULL; /* OBD_FREE writes some magic there */
674 mutex_unlock(&lli->lli_och_mutex);
	/* Undo statahead claim and drop the fd on the error path. */
677 if (opendir_set != 0)
678 ll_stop_statahead(inode, lli->lli_opendir_key);
680 ll_file_data_put(fd);
682 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
685 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
686 ptlrpc_req_finished(it->d.lustre.it_data);
687 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
693 /* Fills the obdo with the attributes for the lsm */
694 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
695 struct obd_capa *capa, struct obdo *obdo,
696 __u64 ioepoch, int sync)
698 struct ptlrpc_request_set *set;
699 struct obd_info oinfo = { { { 0 } } };
704 LASSERT(lsm != NULL);
	/* Identify the object and request the full attribute set. */
708 oinfo.oi_oa->o_id = lsm->lsm_object_id;
709 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
710 oinfo.oi_oa->o_mode = S_IFREG;
711 oinfo.oi_oa->o_ioepoch = ioepoch;
712 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
713 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
714 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
715 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
716 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
717 OBD_MD_FLDATAVERSION;
718 oinfo.oi_capa = capa;
	/* @sync requests the getattr under a server-side lock. */
720 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
721 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	/* Fire the getattr asynchronously and wait for the set. */
724 set = ptlrpc_prep_set();
726 CERROR("can't allocate ptlrpc set\n");
729 rc = obd_getattr_async(exp, &oinfo, set);
731 rc = ptlrpc_set_wait(set);
732 ptlrpc_set_destroy(set);
	/* Keep only the attribute bits callers are allowed to trust. */
735 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
736 OBD_MD_FLATIME | OBD_MD_FLMTIME |
737 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
738 OBD_MD_FLDATAVERSION);
743 * Performs the getattr on the inode and updates its fields.
744 * If @sync != 0, perform the getattr under the server-side lock.
746 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
747 __u64 ioepoch, int sync)
749 struct obd_capa *capa = ll_mdscapa_get(inode);
750 struct lov_stripe_md *lsm;
	/* Take a reference on the stripe MD for the duration of the call. */
754 lsm = ccc_inode_lsm_get(inode);
755 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
756 capa, obdo, ioepoch, sync);
	/* Push the returned attributes into the VFS inode. */
759 obdo_refresh_inode(inode, obdo, obdo->o_valid);
761 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
762 lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
763 (unsigned long long)inode->i_blocks,
764 (unsigned long)ll_inode_blksize(inode));
766 ccc_inode_lsm_put(inode, lsm);
/* Merge the MDS-provided timestamps cached in lli_lvb with the
 * attributes obtained from the OSTs via the cl_object layer, taking
 * the most recent of each, and update i_size/i_blocks accordingly. */
770 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
772 struct ll_inode_info *lli = ll_i2info(inode);
773 struct cl_object *obj = lli->lli_clob;
774 struct cl_attr *attr = ccc_env_thread_attr(env);
780 ll_inode_size_lock(inode);
781 /* merge timestamps the most recently obtained from mds with
782 timestamps obtained from osts */
783 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
784 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
785 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
786 inode_init_lvb(inode, &lvb);
	/* Fetch OST-side attributes under the object attribute lock. */
788 cl_object_attr_lock(obj);
789 rc = cl_object_attr_get(env, obj, attr);
790 cl_object_attr_unlock(obj);
	/* Keep whichever timestamp is newer, MDS or OST. */
793 if (lvb.lvb_atime < attr->cat_atime)
794 lvb.lvb_atime = attr->cat_atime;
795 if (lvb.lvb_ctime < attr->cat_ctime)
796 lvb.lvb_ctime = attr->cat_ctime;
797 if (lvb.lvb_mtime < attr->cat_mtime)
798 lvb.lvb_mtime = attr->cat_mtime;
800 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
801 PFID(&lli->lli_fid), attr->cat_size);
802 cl_isize_write_nolock(inode, attr->cat_size);
804 inode->i_blocks = attr->cat_blocks;
806 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
807 LTIME_S(inode->i_atime) = lvb.lvb_atime;
808 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
810 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctls: fetch size/blocks/timestamps for @lsm
 * from the OSTs and copy them into the user-visible stat structure. */
815 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
818 struct obdo obdo = { 0 };
821 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
823 st->st_size = obdo.o_size;
824 st->st_blocks = obdo.o_blocks;
825 st->st_mtime = obdo.o_mtime;
826 st->st_atime = obdo.o_atime;
827 st->st_ctime = obdo.o_ctime;
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: propagate O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT semantics
 * and choose the DLM locking policy. */
832 void ll_io_init(struct cl_io *io, const struct file *file, int write)
834 struct inode *inode = file->f_dentry->d_inode;
836 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
838 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
839 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
840 file->f_flags & O_DIRECT ||
843 io->ci_obj = ll_i2info(inode)->lli_clob;
844 io->ci_lockreq = CILR_MAYBE;
	/* nolock mounts skip client locks and ask for server-side locking;
	 * O_APPEND writes must be serialized with a mandatory lock. */
845 if (ll_file_nolock(file)) {
846 io->ci_lockreq = CILR_NEVER;
847 io->ci_no_srvlock = 1;
848 } else if (file->f_flags & O_APPEND) {
849 io->ci_lockreq = CILR_MANDATORY;
/* Common driver for all read/write entry points (read/write, aio,
 * sendfile, splice): set up the cl_io, dispatch per-subtype arguments,
 * run the IO loop, update *ppos and per-mount IO statistics. */
854 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
855 struct file *file, enum cl_io_type iot,
856 loff_t *ppos, size_t count)
858 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
859 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
865 io = ccc_env_thread_io(env);
866 ll_io_init(io, file, iot == CIT_WRITE);
868 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
869 struct vvp_io *vio = vvp_env_io(env);
870 struct ccc_io *cio = ccc_env_io(env);
871 int write_mutex_locked = 0;
873 cio->cui_fd = LUSTRE_FPRIVATE(file);
874 vio->cui_io_subtype = args->via_io_subtype;
	/* Copy subtype-specific arguments into the per-thread IO state. */
876 switch (vio->cui_io_subtype) {
878 cio->cui_iov = args->u.normal.via_iov;
879 cio->cui_nrsegs = args->u.normal.via_nrsegs;
880 cio->cui_tot_nrsegs = cio->cui_nrsegs;
881 #ifndef HAVE_FILE_WRITEV
882 cio->cui_iocb = args->u.normal.via_iocb;
	/* Writes (outside a group lock) take lli_write_mutex; reads
	 * only take the truncate semaphore shared. */
884 if ((iot == CIT_WRITE) &&
885 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
886 if (mutex_lock_interruptible(&lli->
888 GOTO(out, result = -ERESTARTSYS);
889 write_mutex_locked = 1;
890 } else if (iot == CIT_READ) {
891 down_read(&lli->lli_trunc_sem);
895 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
896 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
899 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
900 vio->u.splice.cui_flags = args->u.splice.via_flags;
903 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
906 result = cl_io_loop(env, io);
907 if (write_mutex_locked)
908 mutex_unlock(&lli->lli_write_mutex);
909 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
910 up_read(&lli->lli_trunc_sem);
912 /* cl_io_rw_init() handled IO */
913 result = io->ci_result;
	/* Advance the file position by the bytes actually transferred. */
916 if (io->ci_nob > 0) {
918 *ppos = io->u.ci_wr.wr.crw_pos;
923 if (result == 0 && io->ci_need_restart) /* need to restart whole IO */
	/* Account transferred bytes; track write failures for close(). */
926 if (iot == CIT_READ) {
928 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
929 LPROC_LL_READ_BYTES, result);
930 } else if (iot == CIT_WRITE) {
932 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
933 LPROC_LL_WRITE_BYTES, result);
934 fd->fd_write_failed = false;
935 } else if (result != -ERESTARTSYS) {
936 fd->fd_write_failed = true;
945 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
947 static int ll_file_get_iov_count(const struct iovec *iov,
948 unsigned long *nr_segs, size_t *count)
953 for (seg = 0; seg < *nr_segs; seg++) {
954 const struct iovec *iv = &iov[seg];
957 * If any segment has a negative length, or the cumulative
958 * length ever wraps negative then return -EINVAL.
961 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
	/* Verify the user buffer is readable before accepting it. */
963 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
968 cnt -= iv->iov_len; /* This segment is no good */
975 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec,
 * then run the generic IO path with a normal-IO argument block. */
976 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
977 unsigned long nr_segs, loff_t *ppos)
980 struct vvp_io_args *args;
986 result = ll_file_get_iov_count(iov, &nr_segs, &count);
990 env = cl_env_get(&refcheck);
992 RETURN(PTR_ERR(env));
994 args = vvp_env_args(env, IO_NORMAL);
995 args->u.normal.via_iov = (struct iovec *)iov;
996 args->u.normal.via_nrsegs = nr_segs;
998 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
999 cl_env_put(env, &refcheck);
/* Single-buffer read (readv-based variant): wrap (@buf, @count) in a
 * one-element iovec and delegate to ll_file_readv(). */
1003 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1007 struct iovec *local_iov;
1012 env = cl_env_get(&refcheck);
1014 RETURN(PTR_ERR(env));
	/* Use the per-thread scratch iovec to avoid a stack allocation. */
1016 local_iov = &vvp_env_info(env)->vti_local_iov;
1017 local_iov->iov_base = (void __user *)buf;
1018 local_iov->iov_len = count;
1019 result = ll_file_readv(file, local_iov, 1, ppos);
1020 cl_env_put(env, &refcheck);
/* aio read entry point: validate the iovec, then run the generic IO
 * path; the position is taken from and written back to iocb->ki_pos. */
1025 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1026 unsigned long nr_segs, loff_t pos)
1029 struct vvp_io_args *args;
1035 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1039 env = cl_env_get(&refcheck);
1041 RETURN(PTR_ERR(env));
1043 args = vvp_env_args(env, IO_NORMAL);
1044 args->u.normal.via_iov = (struct iovec *)iov;
1045 args->u.normal.via_nrsegs = nr_segs;
1046 args->u.normal.via_iocb = iocb;
1048 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1049 &iocb->ki_pos, count);
1050 cl_env_put(env, &refcheck);
/* Single-buffer read (aio-based variant): build a synchronous kiocb
 * and one-element iovec, delegate to ll_file_aio_read(), then report
 * the new position back through *ppos. */
1054 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1058 struct iovec *local_iov;
1059 struct kiocb *kiocb;
1064 env = cl_env_get(&refcheck);
1066 RETURN(PTR_ERR(env));
	/* Per-thread scratch iovec/kiocb from the environment info. */
1068 local_iov = &vvp_env_info(env)->vti_local_iov;
1069 kiocb = &vvp_env_info(env)->vti_kiocb;
1070 local_iov->iov_base = (void __user *)buf;
1071 local_iov->iov_len = count;
1072 init_sync_kiocb(kiocb, file);
1073 kiocb->ki_pos = *ppos;
1074 kiocb->ki_left = count;
1076 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1077 *ppos = kiocb->ki_pos;
1079 cl_env_put(env, &refcheck);
1085 * Write to a file (through the page cache).
1087 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): validate the iovec,
 * then run the generic IO path as CIT_WRITE. */
1088 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1089 unsigned long nr_segs, loff_t *ppos)
1092 struct vvp_io_args *args;
1098 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1102 env = cl_env_get(&refcheck);
1104 RETURN(PTR_ERR(env));
1106 args = vvp_env_args(env, IO_NORMAL);
1107 args->u.normal.via_iov = (struct iovec *)iov;
1108 args->u.normal.via_nrsegs = nr_segs;
1110 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1111 cl_env_put(env, &refcheck);
/* Single-buffer write (writev-based variant): wrap (@buf, @count) in
 * a one-element iovec and delegate to ll_file_writev(). */
1115 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1119 struct iovec *local_iov;
1124 env = cl_env_get(&refcheck);
1126 RETURN(PTR_ERR(env));
1128 local_iov = &vvp_env_info(env)->vti_local_iov;
1129 local_iov->iov_base = (void __user *)buf;
1130 local_iov->iov_len = count;
1132 result = ll_file_writev(file, local_iov, 1, ppos);
1133 cl_env_put(env, &refcheck);
1137 #else /* AIO stuff */
/* aio write entry point: validate the iovec, then run the generic IO
 * path; the position is taken from and written back to iocb->ki_pos. */
1138 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1139 unsigned long nr_segs, loff_t pos)
1142 struct vvp_io_args *args;
1148 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1152 env = cl_env_get(&refcheck);
1154 RETURN(PTR_ERR(env));
1156 args = vvp_env_args(env, IO_NORMAL);
1157 args->u.normal.via_iov = (struct iovec *)iov;
1158 args->u.normal.via_nrsegs = nr_segs;
1159 args->u.normal.via_iocb = iocb;
1161 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1162 &iocb->ki_pos, count);
1163 cl_env_put(env, &refcheck);
/* Single-buffer write (aio-based variant): build a synchronous kiocb
 * and one-element iovec, delegate to ll_file_aio_write(), then report
 * the new position back through *ppos. */
1167 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1171 struct iovec *local_iov;
1172 struct kiocb *kiocb;
1177 env = cl_env_get(&refcheck);
1179 RETURN(PTR_ERR(env));
	/* Per-thread scratch iovec/kiocb from the environment info. */
1181 local_iov = &vvp_env_info(env)->vti_local_iov;
1182 kiocb = &vvp_env_info(env)->vti_kiocb;
1183 local_iov->iov_base = (void __user *)buf;
1184 local_iov->iov_len = count;
1185 init_sync_kiocb(kiocb, file);
1186 kiocb->ki_pos = *ppos;
1187 kiocb->ki_left = count;
1189 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1190 *ppos = kiocb->ki_pos;
1192 cl_env_put(env, &refcheck);
1198 #ifdef HAVE_KERNEL_SENDFILE
1200 * Send file content (through pagecache) somewhere with helper
1202 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1203 read_actor_t actor, void *target)
1206 struct vvp_io_args *args;
1211 env = cl_env_get(&refcheck);
1213 RETURN(PTR_ERR(env));
	/* sendfile uses the IO_SENDFILE subtype with the caller's actor. */
1215 args = vvp_env_args(env, IO_SENDFILE);
1216 args->u.sendfile.via_target = target;
1217 args->u.sendfile.via_actor = actor;
1219 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1220 cl_env_put(env, &refcheck);
1225 #ifdef HAVE_KERNEL_SPLICE_READ
1227 * Send file content (through pagecache) somewhere with helper
1229 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1230 struct pipe_inode_info *pipe, size_t count,
1234 struct vvp_io_args *args;
1239 env = cl_env_get(&refcheck);
1241 RETURN(PTR_ERR(env));
	/* splice uses the IO_SPLICE subtype with the destination pipe. */
1243 args = vvp_env_args(env, IO_SPLICE);
1244 args->u.splice.via_pipe = pipe;
1245 args->u.splice.via_flags = flags;
1247 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1248 cl_env_put(env, &refcheck);
/* Re-create a lost OST object (@id/@seq) on OST index @ost_idx for
 * @inode: clone the stripe MD, mark the obdo with RECREATE_OBJS and
 * call obd_create() under the inode size lock. */
1253 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1256 struct obd_export *exp = ll_i2dtexp(inode);
1257 struct obd_trans_info oti = { 0 };
1258 struct obdo *oa = NULL;
1261 struct lov_stripe_md *lsm = NULL, *lsm2;
1268 lsm = ccc_inode_lsm_get(inode);
1270 GOTO(out, rc = -ENOENT);
	/* Size of the MD clone: header plus one lov_oinfo per stripe. */
1272 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1273 (lsm->lsm_stripe_count));
1275 OBD_ALLOC_LARGE(lsm2, lsm_size);
1277 GOTO(out, rc = -ENOMEM);
	/* o_nlink carries the target OST index for object recreation. */
1281 oa->o_nlink = ost_idx;
1282 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1283 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1284 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1285 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1286 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1287 memcpy(lsm2, lsm, lsm_size);
1288 ll_inode_size_lock(inode);
1289 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1290 ll_inode_size_unlock(inode);
1292 OBD_FREE_LARGE(lsm2, lsm_size);
1295 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_recreate_obj(): ioctl handler (LL_IOC_RECREATE_OBJ). Copies a
 * struct ll_recreate_obj from userspace and recreates the named object
 * via ll_lov_recreate(). Root (CAP_SYS_ADMIN) only.
 * NOTE(review): excerpt — error-return lines are elided.
 */
1300 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1302         struct ll_recreate_obj ucreat;
1305         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1308         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1312         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1313                                ucreat.lrc_ost_idx));
/*
 * ll_lov_recreate_fid(): ioctl handler (LL_IOC_RECREATE_FID). Decodes
 * an object id and OST index packed into a lu_fid supplied by userspace
 * and recreates that object. Root (CAP_SYS_ADMIN) only.
 */
1316 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1323         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1326         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
/* id = oid | low 16 bits of seq << 32; ost index = next 16 bits of seq */
1329         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1330         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1331         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * ll_lov_setstripe_ea_info(): set the striping EA on a file by doing an
 * intent open carrying the lov_user_md. Fails if the file already has
 * stripe metadata (striping can only be set once, at creation).
 * NOTE(review): excerpt — several declarations/branches are elided.
 */
1334 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1335                       int flags, struct lov_user_md *lum, int lum_size)
1337         struct lov_stripe_md *lsm = NULL;
1338         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1342         lsm = ccc_inode_lsm_get(inode);
/* stripe md already present — cannot restripe an existing file */
1344                 ccc_inode_lsm_put(inode, lsm);
1345                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1350         ll_inode_size_lock(inode);
1351         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1354         rc = oit.d.lustre.it_status;
1356                 GOTO(out_req_free, rc);
/* the open handle opened for the setstripe intent is no longer needed */
1358         ll_release_openhandle(file->f_dentry, &oit);
1361         ll_inode_size_unlock(inode);
1362         ll_intent_release(&oit);
1363         ccc_inode_lsm_put(inode, lsm);
1366         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * ll_lov_getstripe_ea_info(): fetch the LOV EA (striping metadata) of
 * @filename from the MDS via md_getattr_name(). On success *lmmp points
 * into the reply buffer (caller must keep *request until done) and
 * *lmm_size is set. Swabs the EA to host endianness when needed.
 * NOTE(review): excerpt — some declarations and error paths are elided.
 */
1370 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1371                       struct lov_mds_md **lmmp, int *lmm_size,
1372                       struct ptlrpc_request **request)
1374         struct ll_sb_info *sbi = ll_i2sbi(inode);
1375         struct mdt_body *body;
1376         struct lov_mds_md *lmm = NULL;
1377         struct ptlrpc_request *req = NULL;
1378         struct md_op_data *op_data;
/* size the getattr reply buffer for the largest possible EA */
1381         rc = ll_get_max_mdsize(sbi, &lmmsize);
1385         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1386                                      strlen(filename), lmmsize,
1387                                      LUSTRE_OPC_ANY, NULL);
1388         if (IS_ERR(op_data))
1389                 RETURN(PTR_ERR(op_data));
1391         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1392         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1393         ll_finish_md_op_data(op_data);
1395                 CDEBUG(D_INFO, "md_getattr_name failed "
1396                        "on %s: rc %d\n", filename, rc);
1400         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1401         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1403         lmmsize = body->eadatasize;
/* no EA data in the reply means the file has no striping info */
1405         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1407                 GOTO(out, rc = -ENODATA);
1410         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1411         LASSERT(lmm != NULL);
1413         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1414             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1415                 GOTO(out, rc = -EPROTO);
1419          * This is coming from the MDS, so is probably in
1420          * little endian.  We convert it to host endian before
1421          * passing it to userspace.
/* swab only on big-endian hosts (LOV_MAGIC differs from its LE form) */
1423         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1424                 /* if function called for directory - we should
1425                  * avoid swab not existent lsm objects */
1426                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1427                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1428                         if (S_ISREG(body->mode))
1429                                 lustre_swab_lov_user_md_objects(
1430                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1431                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1432                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1433                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1434                         if (S_ISREG(body->mode))
1435                                 lustre_swab_lov_user_md_objects(
1436                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1437                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1443         *lmm_size = lmmsize;
/*
 * ll_lov_setea(): ioctl handler (LL_IOC_LOV_SETEA). Copies a raw
 * lov_user_md (with one ost_data slot) from userspace and applies it
 * with MDS_OPEN_HAS_OBJS, i.e. the EA references pre-existing objects.
 * Root (CAP_SYS_ADMIN) only.
 */
1448 static int ll_lov_setea(struct inode *inode, struct file *file,
1451         int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1452         struct lov_user_md      *lump;
1453         int                      lum_size = sizeof(struct lov_user_md) +
1454                                             sizeof(struct lov_user_ost_data);
1458         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1461         OBD_ALLOC_LARGE(lump, lum_size);
1465         if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1466                 OBD_FREE_LARGE(lump, lum_size);
1470         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1472         OBD_FREE_LARGE(lump, lum_size);
/*
 * ll_lov_setstripe(): ioctl handler (LL_IOC_LOV_SETSTRIPE). Reads a v1
 * lov_user_md first (the common, smaller case) and re-reads as v3 when
 * the magic says so, then applies the striping. On success it reports
 * the resulting layout back through the user's buffer.
 * NOTE(review): excerpt — some error returns/branch ends are elided.
 */
1476 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1479         struct lov_user_md_v3    lumv3;
1480         struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1481         struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1482         struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1484         int                      flags = FMODE_WRITE;
1487         /* first try with v1 which is smaller than v3 */
1488         lum_size = sizeof(struct lov_user_md_v1);
1489         if (copy_from_user(lumv1, lumv1p, lum_size))
1492         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1493                 lum_size = sizeof(struct lov_user_md_v3);
1494                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1498         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1500                 struct lov_stripe_md *lsm;
/* tell userspace the stripe count field is to be re-read below */
1503                 put_user(0, &lumv1p->lmm_stripe_count);
1505                 ll_layout_refresh(inode, &gen);
1506                 lsm = ccc_inode_lsm_get(inode);
1507                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1508                                    0, lsm, (void *)arg);
1509                 ccc_inode_lsm_put(inode, lsm);
/*
 * ll_lov_getstripe(): ioctl handler (LL_IOC_LOV_GETSTRIPE). Grabs the
 * inode's stripe md and lets the LOV layer copy the layout out to the
 * userspace buffer at @arg.
 * NOTE(review): excerpt — the no-lsm error path is elided.
 */
1514 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1516         struct lov_stripe_md *lsm;
1520         lsm = ccc_inode_lsm_get(inode);
1522                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1524         ccc_inode_lsm_put(inode, lsm);
/*
 * ll_get_grouplock(): take a Lustre group lock (gid = @arg) on the
 * file. fd_flags/fd_grouplock are protected by lli_lock; the lock is
 * dropped around the (blocking) cl_get_grouplock() call, so a losing
 * racer must release the grouplock it obtained.
 */
1528 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1530         struct ll_inode_info   *lli = ll_i2info(inode);
1531         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1532         struct ccc_grouplock    grouplock;
/* group locks and nolock mounts are mutually exclusive */
1536         if (ll_file_nolock(file))
1537                 RETURN(-EOPNOTSUPP);
1539         spin_lock(&lli->lli_lock);
1540         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1541                 CWARN("group lock already existed with gid %lu\n",
1542                       fd->fd_grouplock.cg_gid);
1543                 spin_unlock(&lli->lli_lock);
1546         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1547         spin_unlock(&lli->lli_lock);
/* may block; must not hold lli_lock across it */
1549         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1550                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1554         spin_lock(&lli->lli_lock);
1555         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1556                 spin_unlock(&lli->lli_lock);
1557                 CERROR("another thread just won the race\n");
1558                 cl_put_grouplock(&grouplock);
1562         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1563         fd->fd_grouplock = grouplock;
1564         spin_unlock(&lli->lli_lock);
1566         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * ll_put_grouplock(): release the group lock previously taken on this
 * file descriptor. Verifies (under lli_lock) that a group lock is held
 * and that its gid matches @arg before clearing the per-fd state and
 * dropping the cl-layer lock outside the spinlock.
 */
1570 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1572         struct ll_inode_info   *lli = ll_i2info(inode);
1573         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1574         struct ccc_grouplock    grouplock;
1577         spin_lock(&lli->lli_lock);
1578         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1579                 spin_unlock(&lli->lli_lock);
1580                 CWARN("no group lock held\n");
1583         LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* wrong gid — refuse to drop somebody else's lock */
1585         if (fd->fd_grouplock.cg_gid != arg) {
1586                 CWARN("group lock %lu doesn't match current id %lu\n",
1587                       arg, fd->fd_grouplock.cg_gid);
1588                 spin_unlock(&lli->lli_lock);
/* detach state under the spinlock, release the lock outside it */
1592         grouplock = fd->fd_grouplock;
1593         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1594         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1595         spin_unlock(&lli->lli_lock);
1597         cl_put_grouplock(&grouplock);
1598         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1603  * Close inode open handle
1605  * \param dentry [in]     dentry which contains the inode
1606  * \param it     [in,out] intent which contains open info and result
1609  * \retval <0 failure
/*
 * ll_release_openhandle(): close the MDS open handle carried by an
 * intent (e.g. one created as a side effect of a setstripe open).
 * No-op for the fs root or when the intent holds no open handle.
 * NOTE(review): excerpt — some cleanup lines are elided.
 */
1611 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1613         struct inode *inode = dentry->d_inode;
1614         struct obd_client_handle *och;
1620         /* Root ? Do nothing. */
1621         if (dentry->d_inode->i_sb->s_root == dentry)
1624         /* No open handle to close? Move away */
1625         if (!it_disposition(it, DISP_OPEN_OPEN))
1628         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1630         OBD_ALLOC(och, sizeof(*och));
1632                 GOTO(out, rc = -ENOMEM);
1634         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1635                     ll_i2info(inode), it, och);
1637         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1640         /* this one is in place of ll_file_open */
1641         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1642                 ptlrpc_req_finished(it->d.lustre.it_data);
1643                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1649  * Get size for inode for which FIEMAP mapping is requested.
1650  * Make the FIEMAP get_info call and returns the result.
/*
 * ll_do_fiemap(): validate fiemap flags, optionally flush dirty pages
 * (FIEMAP_FLAG_SYNC), then pack a ll_fiemap_info_key and ask the OSTs
 * for the extent mapping via obd_get_info(KEY_FIEMAP).
 * NOTE(review): excerpt — some declarations and exits are elided.
 */
1652 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1655         struct obd_export *exp = ll_i2dtexp(inode);
1656         struct lov_stripe_md *lsm = NULL;
1657         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1658         int vallen = num_bytes;
1662         /* Checks for fiemap flags */
1663         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller */
1664                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1668         /* Check for FIEMAP_FLAG_SYNC */
1669         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1670                 rc = filemap_fdatawrite(inode->i_mapping);
1675         lsm = ccc_inode_lsm_get(inode);
1679         /* If the stripe_count > 1 and the application does not understand
1680          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1682         if (lsm->lsm_stripe_count > 1 &&
1683             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1684                 GOTO(out, rc = -EOPNOTSUPP);
1686         fm_key.oa.o_id = lsm->lsm_object_id;
1687         fm_key.oa.o_seq = lsm->lsm_object_seq;
1688         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1690         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1691         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1692         /* If filesize is 0, then there would be no objects for mapping */
1693         if (fm_key.oa.o_size == 0) {
1694                 fiemap->fm_mapped_extents = 0;
1698         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1700         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1703                 CERROR("obd_get_info failed: rc = %d\n", rc);
1706         ccc_inode_lsm_put(inode, lsm);
/*
 * ll_fid2path(): ioctl handler (OBD_IOC_FID2PATH). Reads the caller's
 * getinfo_fid2path header to learn the path buffer length, allocates a
 * correspondingly sized output structure, asks the MDC to resolve the
 * FID to a path, and copies the result back to userspace. Restricted to
 * CAP_DAC_READ_SEARCH unless the fs allows user fid2path.
 * NOTE(review): excerpt — allocation-failure branches are elided.
 */
1710 int ll_fid2path(struct inode *inode, void *arg)
1712         struct obd_export       *exp = ll_i2mdexp(inode);
1713         struct getinfo_fid2path *gfout, *gfin;
1717         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1718             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1721         /* Need to get the buflen */
1722         OBD_ALLOC_PTR(gfin);
1725         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1730         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1731         OBD_ALLOC(gfout, outsize);
1732         if (gfout == NULL) {
1736         memcpy(gfout, gfin, sizeof(*gfout));
1739         /* Call mdc_iocontrol */
1740         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1743         if (copy_to_user(arg, gfout, outsize))
1747         OBD_FREE(gfout, outsize);
/*
 * ll_ioctl_fiemap(): ioctl front end for FSFILT_IOC_FIEMAP. Sizes and
 * allocates a kernel fiemap buffer from the user-supplied extent count,
 * copies the request (and, if extents were requested, the first extent
 * used as the continuation cookie) in, runs ll_do_fiemap(), and copies
 * header plus mapped extents back out.
 * NOTE(review): excerpt — some error returns are elided.
 */
1751 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1753         struct ll_user_fiemap *fiemap_s;
1754         size_t num_bytes, ret_bytes;
1755         unsigned int extent_count;
1758         /* Get the extent count so we can calculate the size of
1759          * required fiemap buffer */
1760         if (get_user(extent_count,
1761             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1763         num_bytes = sizeof(*fiemap_s) + (extent_count *
1764                                          sizeof(struct ll_fiemap_extent));
1766         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1767         if (fiemap_s == NULL)
1770         /* get the fiemap value */
1771         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1773                 GOTO(error, rc = -EFAULT);
1775         /* If fm_extent_count is non-zero, read the first extent since
1776          * it is used to calculate end_offset and device from previous
1779                 if (copy_from_user(&fiemap_s->fm_extents[0],
1780                     (char __user *)arg + sizeof(*fiemap_s),
1781                     sizeof(struct ll_fiemap_extent)))
1782                         GOTO(error, rc = -EFAULT);
1785         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1789         ret_bytes = sizeof(struct ll_user_fiemap);
/* only copy extent records back if the caller asked for extents */
1791         if (extent_count != 0)
1792                 ret_bytes += (fiemap_s->fm_mapped_extents *
1793                                  sizeof(struct ll_fiemap_extent));
1795         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1799         OBD_FREE_LARGE(fiemap_s, num_bytes);
1804  * Read the data_version for inode.
1806  * This value is computed using stripe object version on OST.
1807  * Version is computed using server side locking.
1809  * @param extent_lock  Take extent lock. Not needed if a process is already
1810  *                     holding the OST object group locks.
/*
 * ll_data_version(): fetch the file's data version from the OSTs via
 * ll_lsm_getattr(). A file without stripe objects reports version 0.
 * NOTE(review): excerpt — some exits/frees are elided.
 */
1812 int ll_data_version(struct inode *inode, __u64 *data_version,
1815         struct lov_stripe_md    *lsm = NULL;
1816         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1817         struct obdo             *obdo = NULL;
1821         /* If no stripe, we consider version is 0. */
1822         lsm = ccc_inode_lsm_get(inode);
1825                 CDEBUG(D_INODE, "No object for inode\n");
1829         OBD_ALLOC_PTR(obdo);
1831                 ccc_inode_lsm_put(inode, lsm);
1835         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* server did not return a data version — treat as an error case */
1837                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1840                         *data_version = obdo->o_data_version;
1844         ccc_inode_lsm_put(inode, lsm);
/*
 * ll_swap_layout(): swap the layouts of two regular files on the same
 * filesystem (LL_IOC_LOV_SWAP_LAYOUTS back end). Orders the two inodes
 * by FID to serialize concurrent swaps, optionally takes group locks on
 * both files (gid != 0) to flush dirty cache, and sends the swap to the
 * MDT with the mdc_swap_layouts flags packed in md_op_data.
 * NOTE(review): excerpt — gid setup and some error paths are elided.
 */
1849 static int ll_swap_layout(struct file *file, struct file *file2,
1850                           struct lustre_swap_layouts *lsl)
1852         struct mdc_swap_layouts  msl = { .msl_flags = lsl->sl_flags };
1853         struct md_op_data       *op_data;
1854         struct inode            *inode = file->f_dentry->d_inode;
1855         struct inode            *inode2 = file2->f_dentry->d_inode;
1859         if (!S_ISREG(inode2->i_mode))
1862         if (inode_permission(inode, MAY_WRITE) ||
1863             inode_permission(inode2, MAY_WRITE))
1866         if (inode2->i_sb != inode->i_sb)
/* order by FID so two concurrent swaps cannot deadlock */
1869         rc = lu_fid_cmp(ll_inode2fid(inode), ll_inode2fid(inode2));
1870         if (rc == 0) /* same file, done! */
1873         if (rc < 0) { /* sequentialize it */
1874                 swap(inode, inode2);
1879         if (gid != 0) { /* application asks to flush dirty cache */
1880                 rc = ll_get_grouplock(inode, file, gid);
1884                 rc = ll_get_grouplock(inode2, file2, gid);
1886                         ll_put_grouplock(inode, file, gid);
1891         /* struct md_op_data is used to send the swap args to the mdt
1892          * only flags is missing, so we use struct mdc_swap_layouts
1893          * through the md_op_data->op_data */
1895         op_data = ll_prep_md_op_data(NULL, inode, inode2, NULL, 0, 0,
1896                                      LUSTRE_OPC_ANY, &msl);
1897         if (op_data != NULL) {
1898                 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(inode),
1899                                    sizeof(*op_data), op_data, NULL);
1900                 ll_finish_md_op_data(op_data);
/* release group locks in reverse order of acquisition */
1904                 ll_put_grouplock(inode2, file2, gid);
1905                 ll_put_grouplock(inode, file, gid);
/*
 * ll_file_ioctl(): the llite per-file ioctl dispatcher. Handles the
 * LL_IOC_*, FSFILT_IOC_* and OBD_IOC_* commands visible below, falling
 * through to ll_iocontrol_call()/obd_iocontrol() for anything else.
 * NOTE(review): excerpt — many case bodies and error returns are
 * elided from this view; comments describe only the visible lines.
 */
1911 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1913         struct inode            *inode = file->f_dentry->d_inode;
1914         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
1918         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1919                inode->i_generation, inode, cmd);
1920         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1922         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1923         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1927         case LL_IOC_GETFLAGS:
1928                 /* Get the current value of the file flags */
1929                 return put_user(fd->fd_flags, (int *)arg);
1930         case LL_IOC_SETFLAGS:
1931         case LL_IOC_CLRFLAGS:
1932                 /* Set or clear specific file flags */
1933                 /* XXX This probably needs checks to ensure the flags are
1934                  *     not abused, and to handle any flag side effects.
1936                 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only safe on O_DIRECT file descriptors */
1939                 if (cmd == LL_IOC_SETFLAGS) {
1940                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1941                             !(file->f_flags & O_DIRECT)) {
1942                                 CERROR("%s: unable to disable locking on "
1943                                        "non-O_DIRECT file\n", current->comm);
1947                         fd->fd_flags |= flags;
1949                         fd->fd_flags &= ~flags;
1952         case LL_IOC_LOV_SETSTRIPE:
1953                 RETURN(ll_lov_setstripe(inode, file, arg));
1954         case LL_IOC_LOV_SETEA:
1955                 RETURN(ll_lov_setea(inode, file, arg));
1956         case LL_IOC_LOV_SWAP_LAYOUTS: {
1958                 struct lustre_swap_layouts lsl;
1960                 if (cfs_copy_from_user(&lsl, (char *)arg,
1961                                        sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
1964                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
1967                 file2 = cfs_get_fd(lsl.sl_fd);
1972                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
1973                         rc = ll_swap_layout(file, file2, &lsl);
1974                 cfs_put_file(file2);
1977         case LL_IOC_LOV_GETSTRIPE:
1978                 RETURN(ll_lov_getstripe(inode, arg));
1979         case LL_IOC_RECREATE_OBJ:
1980                 RETURN(ll_lov_recreate_obj(inode, arg));
1981         case LL_IOC_RECREATE_FID:
1982                 RETURN(ll_lov_recreate_fid(inode, arg));
1983         case FSFILT_IOC_FIEMAP:
1984                 RETURN(ll_ioctl_fiemap(inode, arg));
1985         case FSFILT_IOC_GETFLAGS:
1986         case FSFILT_IOC_SETFLAGS:
1987                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1988         case FSFILT_IOC_GETVERSION_OLD:
1989         case FSFILT_IOC_GETVERSION:
1990                 RETURN(put_user(inode->i_generation, (int *)arg));
1991         case LL_IOC_GROUP_LOCK:
1992                 RETURN(ll_get_grouplock(inode, file, arg));
1993         case LL_IOC_GROUP_UNLOCK:
1994                 RETURN(ll_put_grouplock(inode, file, arg));
1995         case IOC_OBD_STATFS:
1996                 RETURN(ll_obd_statfs(inode, (void *)arg));
1998         /* We need to special case any other ioctls we want to handle,
1999          * to send them to the MDS/OST as appropriate and to properly
2000          * network encode the arg field.
2001         case FSFILT_IOC_SETVERSION_OLD:
2002         case FSFILT_IOC_SETVERSION:
2004         case LL_IOC_FLUSHCTX:
2005                 RETURN(ll_flush_ctx(inode));
2006         case LL_IOC_PATH2FID: {
2007                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2008                                  sizeof(struct lu_fid)))
2013         case OBD_IOC_FID2PATH:
2014                 RETURN(ll_fid2path(inode, (void *)arg));
2015         case LL_IOC_DATA_VERSION: {
2016                 struct ioc_data_version idv;
2019                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2022                 rc = ll_data_version(inode, &idv.idv_version,
2023                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2025                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2031         case LL_IOC_GET_MDTIDX: {
2034                 mdtidx = ll_get_mdt_idx(inode);
2038                 if (put_user((int)mdtidx, (int*)arg))
2043         case OBD_IOC_GETDTNAME:
2044         case OBD_IOC_GETMDNAME:
2045                 RETURN(ll_get_obd_name(inode, cmd, arg));
2046         case LL_IOC_HSM_STATE_GET: {
2047                 struct md_op_data       *op_data;
2048                 struct hsm_user_state   *hus;
2055                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2056                                              LUSTRE_OPC_ANY, hus);
2057                 if (op_data == NULL) {
/* HSM state is queried through the MDC iocontrol path */
2062                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2065                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2068                 ll_finish_md_op_data(op_data);
2072         case LL_IOC_HSM_STATE_SET: {
2073                 struct md_op_data       *op_data;
2074                 struct hsm_state_set    *hss;
2080                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2085                 /* Non-root users are forbidden to set or clear flags which are
2086                  * NOT defined in HSM_USER_MASK. */
2087                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2088                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2093                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2094                                              LUSTRE_OPC_ANY, hss);
2095                 if (op_data == NULL) {
2100                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2103                 ll_finish_md_op_data(op_data);
2108         case LL_IOC_HSM_ACTION: {
2109                 struct md_op_data               *op_data;
2110                 struct hsm_current_action       *hca;
2117                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2118                                              LUSTRE_OPC_ANY, hca);
2119                 if (op_data == NULL) {
2124                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2127                 if (cfs_copy_to_user((char *)arg, hca, sizeof(*hca)))
2130                 ll_finish_md_op_data(op_data);
/* unrecognized command: try registered handlers, then the OSC layer */
2138                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2141                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2147 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * llseek_execute(): local backport of the kernel helper — validate the
 * new offset against sign/maxsize rules and commit it to f_pos,
 * resetting f_version when the position actually changes.
 * NOTE(review): excerpt — the error/return lines are elided.
 */
2148 static inline loff_t
2149 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2151         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2153         if (offset > maxsize)
2156         if (offset != file->f_pos) {
2157                 file->f_pos = offset;
2158                 file->f_version = 0;
/*
 * generic_file_llseek_size(): backport of the upstream llseek helper
 * for kernels lacking it (guarded by the #ifndef above). Handles the
 * SEEK_CUR fast path without taking i_mutex for offset 0, and treats
 * everything past @eof as a virtual hole for SEEK_DATA/SEEK_HOLE.
 * NOTE(review): excerpt — the switch statement and several branches
 * are elided; comments reflect only the visible lines.
 */
2164 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2165                 loff_t maxsize, loff_t eof)
2167         struct inode *inode = file->f_dentry->d_inode;
2175                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2176                  * position-querying operation.  Avoid rewriting the "same"
2177                  * f_pos value back to the file because a concurrent read(),
2178                  * write() or lseek() might have altered it
2183                  * f_lock protects against read/modify/write race with other
2184                  * SEEK_CURs. Note that parallel writes and reads behave
2187                 mutex_lock(&inode->i_mutex);
2188                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2189                 mutex_unlock(&inode->i_mutex);
2193                  * In the generic case the entire file is data, so as long as
2194                  * offset isn't at the end of the file then the offset is data.
2201                  * There is a virtual hole at the end of the file, so as long as
2202                  * offset isn't i_size or larger, return i_size.
2210         return llseek_execute(file, offset, maxsize);
/*
 * ll_file_seek(): llite llseek implementation. For SEEK_END (and the
 * SEEK_HOLE/SEEK_DATA variants) a glimpse is needed first so i_size is
 * current, then the generic size-aware llseek helper does the rest.
 * NOTE(review): excerpt — some declarations/returns are elided.
 */
2214 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2216         struct inode *inode = file->f_dentry->d_inode;
2217         loff_t retval, eof = 0;
2220         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2221                            (origin == SEEK_CUR) ? file->f_pos : 0);
2222         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2223                inode->i_ino, inode->i_generation, inode, retval, retval,
2225         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* refresh i_size from the OSTs before any size-relative seek */
2227         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2228                 retval = ll_glimpse_size(inode);
2231                 eof = i_size_read(inode);
2234         retval = generic_file_llseek_size(file, offset, origin,
2235                                           ll_file_maxbytes(inode), eof);
/*
 * ll_flush(): .flush file operation — surface any async writeback
 * errors recorded on this inode to the closing process, unless that
 * error was already reported to this fd (fd_write_failed).
 * NOTE(review): excerpt — a line combining err into rc is elided.
 */
2239 int ll_flush(struct file *file, fl_owner_t id)
2241         struct inode *inode = file->f_dentry->d_inode;
2242         struct ll_inode_info *lli = ll_i2info(inode);
2243         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2246         LASSERT(!S_ISDIR(inode->i_mode));
2248         /* catch async errors that were recorded back when async writeback
2249          * failed for pages in this mapping. */
2250         rc = lli->lli_async_rc;
2251         lli->lli_async_rc = 0;
2252         err = lov_read_and_clear_async_rc(lli->lli_clob);
2256         /* The application has been told write failure already.
2257          * Do not report failure again. */
2258         if (fd->fd_write_failed)
2260         return rc ? -EIO : 0;
2264  * Called to make sure a portion of file has been written out.
2265  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2267  * Return how many pages have been written.
/*
 * cl_sync_file_range(): run a CIT_FSYNC cl_io over [start, end] with
 * the given fsync mode. On success the number of pages written
 * (fi_nr_written) becomes the result.
 * NOTE(review): excerpt — some declarations and the final RETURN are
 * elided from this view.
 */
2269 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2270                        enum cl_fsync_mode mode)
2272         struct cl_env_nest nest;
2275         struct obd_capa *capa = NULL;
2276         struct cl_fsync_io *fio;
/* reject any mode outside the known fsync modes */
2280         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2281             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2284         env = cl_env_nested_get(&nest);
2286                 RETURN(PTR_ERR(env));
2288         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2290         io = ccc_env_thread_io(env);
2291         io->ci_obj = cl_i2info(inode)->lli_clob;
/* fsync must proceed even during a layout change */
2292         io->ci_ignore_layout = 1;
2294         /* initialize parameters for sync */
2295         fio = &io->u.ci_fsync;
2296         fio->fi_capa = capa;
2297         fio->fi_start = start;
2299         fio->fi_fid = ll_inode2fid(inode);
2300         fio->fi_mode = mode;
2301         fio->fi_nr_written = 0;
2303         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2304                 result = cl_io_loop(env, io);
2306                 result = io->ci_result;
2308                 result = fio->fi_nr_written;
2309         cl_io_fini(env, io);
2310         cl_env_nested_put(&nest, env);
/*
 * ll_fsync(): .fsync file operation. Signature varies with the kernel
 * API (4-arg range fsync, 2-arg, or legacy dentry form). Waits for
 * in-flight page I/O, collects recorded async writeback errors, syncs
 * the MDS inode (md_sync) and, for regular files, pushes dirty data to
 * the OSTs through cl_sync_file_range(), tracking fd_write_failed.
 * NOTE(review): excerpt — several lines (declarations, rc merging,
 * capa release) are elided from this view.
 */
2317 #ifdef HAVE_FILE_FSYNC_4ARGS
2318 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2319 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2320 int ll_fsync(struct file *file, int data)
2322 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2325         struct inode *inode = file->f_dentry->d_inode;
2326         struct ll_inode_info *lli = ll_i2info(inode);
2327         struct ptlrpc_request *req;
2328         struct obd_capa *oc;
2332         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2333                inode->i_generation, inode);
2334         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2336 #ifdef HAVE_FILE_FSYNC_4ARGS
/* the 4-arg API expects us to write/wait the range and take i_mutex */
2337         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2338         mutex_lock(&inode->i_mutex);
2340         /* fsync's caller has already called _fdata{sync,write}, we want
2341          * that IO to finish before calling the osc and mdc sync methods */
2342         rc = filemap_fdatawait(inode->i_mapping);
2345         /* catch async errors that were recorded back when async writeback
2346          * failed for pages in this mapping. */
2347         if (!S_ISDIR(inode->i_mode)) {
2348                 err = lli->lli_async_rc;
2349                 lli->lli_async_rc = 0;
2352                         err = lov_read_and_clear_async_rc(lli->lli_clob);
2357         oc = ll_mdscapa_get(inode);
2358         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2364                 ptlrpc_req_finished(req);
2367                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2369                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2371                 if (rc == 0 && err < 0)
/* remember the outcome so ll_flush() won't re-report the same error */
2374                         fd->fd_write_failed = true;
2376                         fd->fd_write_failed = false;
2379 #ifdef HAVE_FILE_FSYNC_4ARGS
2380         mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): .flock/.lock file operation — translate a VFS
 * file_lock (POSIX fcntl lock or BSD flock) into an LDLM_FLOCK enqueue
 * on the MDS, then mirror the result into the local VFS lock lists.
 * If the local step fails, the remote lock is unwound with an LCK_NL
 * (unlock) enqueue.
 * NOTE(review): excerpt — the cmd switch labels and several returns
 * are elided from this view.
 */
2385 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2387         struct inode *inode = file->f_dentry->d_inode;
2388         struct ll_sb_info *sbi = ll_i2sbi(inode);
2389         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2390                                            .ei_cb_cp =ldlm_flock_completion_ast,
2391                                            .ei_cbdata = file_lock };
2392         struct md_op_data *op_data;
2393         struct lustre_handle lockh = {0};
2394         ldlm_policy_data_t flock = {{0}};
2400         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2401                inode->i_ino, file_lock);
2403         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2405         if (file_lock->fl_flags & FL_FLOCK) {
2406                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2407                 /* flocks are whole-file locks */
2408                 flock.l_flock.end = OFFSET_MAX;
2409                 /* For flocks owner is determined by the local file desctiptor*/
2410                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2411         } else if (file_lock->fl_flags & FL_POSIX) {
2412                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2413                 flock.l_flock.start = file_lock->fl_start;
2414                 flock.l_flock.end = file_lock->fl_end;
2418         flock.l_flock.pid = file_lock->fl_pid;
2420         /* Somewhat ugly workaround for svc lockd.
2421          * lockd installs custom fl_lmops->lm_compare_owner that checks
2422          * for the fl_owner to be the same (which it always is on local node
2423          * I guess between lockd processes) and then compares pid.
2424          * As such we assign pid to the owner field to make it all work,
2425          * conflict with normal locks is unlikely since pid space and
2426          * pointer space for current->files are not intersecting */
2427         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2428                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the VFS lock type to an LDLM lock mode */
2430         switch (file_lock->fl_type) {
2432                 einfo.ei_mode = LCK_PR;
2435                 /* An unlock request may or may not have any relation to
2436                  * existing locks so we may not be able to pass a lock handle
2437                  * via a normal ldlm_lock_cancel() request. The request may even
2438                  * unlock a byte range in the middle of an existing lock. In
2439                  * order to process an unlock request we need all of the same
2440                  * information that is given with a normal read or write record
2441                  * lock request. To avoid creating another ldlm unlock (cancel)
2442                  * message we'll treat a LCK_NL flock request as an unlock. */
2443                 einfo.ei_mode = LCK_NL;
2446                 einfo.ei_mode = LCK_PW;
2449                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2450                        file_lock->fl_type);
2465                 flags = LDLM_FL_BLOCK_NOWAIT;
2471                 flags = LDLM_FL_TEST_LOCK;
2472                 /* Save the old mode so that if the mode in the lock changes we
2473                  * can decrement the appropriate reader or writer refcount. */
2474                 file_lock->fl_type = einfo.ei_mode;
2477                 CERROR("unknown fcntl lock command: %d\n", cmd);
2481         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2482                                      LUSTRE_OPC_ANY, NULL);
2483         if (IS_ERR(op_data))
2484                 RETURN(PTR_ERR(op_data));
2486         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2487                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2488                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2490         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2491                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server's decision into the local VFS lock lists */
2493         if ((file_lock->fl_flags & FL_FLOCK) &&
2494             (rc == 0 || file_lock->fl_type == F_UNLCK))
2495                 rc2  = flock_lock_file_wait(file, file_lock);
2496         if ((file_lock->fl_flags & FL_POSIX) &&
2497             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2498             !(flags & LDLM_FL_TEST_LOCK))
2499                 rc2  = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed — undo the server-side lock with LCK_NL */
2501         if (rc2 && file_lock->fl_type != F_UNLCK) {
2502                 einfo.ei_mode = LCK_NL;
2503                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2504                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2508         ll_finish_md_op_data(op_data);
2513 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2521  * test if some locks matching bits and l_req_mode are acquired
2522  * - bits can be in different locks
2523  * - if found clear the common lock bits in *bits
2524  * - the bits not found, are kept in *bits
2526  * \param bits [IN] searched lock bits [IN]
2527  * \param l_req_mode [IN] searched lock mode
2528  * \retval boolean, true iff all bits are found
/*
 * ll_have_md_lock(): probe (LDLM_FL_TEST_LOCK, no references taken)
 * for cached MDS inodebits locks covering *bits; each bit found is
 * cleared from *bits, so on return *bits holds only the missing ones.
 * NOTE(review): excerpt — some declarations and the return are elided.
 */
2530 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2532         struct lustre_handle lockh;
2533         ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode" — match the union of all modes */
2534         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2535                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2544         fid = &ll_i2info(inode)->lli_fid;
2545         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2546                ldlm_lockname[mode]);
2548         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2549         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2550                 policy.l_inodebits.bits = *bits & (1 << i);
2551                 if (policy.l_inodebits.bits == 0)
2554                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2555                                   &policy, mode, &lockh)) {
2556                         struct ldlm_lock *lock;
2558                         lock = ldlm_handle2lock(&lockh);
2561                                         ~(lock->l_policy_data.l_inodebits.bits);
2562                                 LDLM_LOCK_PUT(lock);
2564                                 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): match (and reference) a cached MDS inodebits lock
 * covering @bits in any of CR/CW/PR/PW modes; the matched handle is
 * returned through @lockh and the granted mode is the result.
 * NOTE(review): excerpt — some declarations and the return are elided.
 */
2571 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2572                             struct lustre_handle *lockh, __u64 flags)
2574         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2579         fid = &ll_i2info(inode)->lli_fid;
2580         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2582         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2583                            fid, LDLM_IBITS, &policy,
2584                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * ll_inode_revalidate_fini(): post-process a revalidation result.
 * -ENOENT on a (non-regular, non-directory) inode is treated as the
 * object having been unlinked; any other error is logged with the FID.
 * NOTE(review): excerpt — the success/return lines are elided.
 */
2588 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2590         /* Already unlinked. Just update nlink and return success */
2591         if (rc == -ENOENT) {
2593                 /* This path cannot be hit for regular files unless in
2594                  * case of obscure races, so no need to to validate
2596                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2598         } else if (rc != 0) {
2599                 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2600                        ll_get_fsname(inode->i_sb, NULL, 0),
2601                        PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate_it(): refresh inode attributes from the MDS.
 * With OBD_CONNECT_ATTRFID, issue a getattr-by-fid intent lock
 * (IT_GETATTR, or IT_LOOKUP for LOOKUP-only ibits) and finish via the
 * revalidate path; otherwise, if no matching md lock is cached, do a
 * plain md_getattr() (requesting EA size for regular files) and
 * rebuild the inode with ll_prep_inode().
 * NOTE(review): excerpt — many error-handling and RETURN lines are
 * elided from this view.
 */
2607 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2610         struct inode *inode = dentry->d_inode;
2611         struct ptlrpc_request *req = NULL;
2612         struct obd_export *exp;
2616         LASSERT(inode != NULL);
2618         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2619                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2621         exp = ll_i2mdexp(inode);
2623         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2624          *      But under CMD case, it caused some lock issues, should be fixed
2625          *      with new CMD ibits lock. See bug 12718 */
2626         if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2627                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2628                 struct md_op_data *op_data;
2630                 if (ibits == MDS_INODELOCK_LOOKUP)
2631                         oit.it_op = IT_LOOKUP;
2633                 /* Call getattr by fid, so do not provide name at all. */
2634                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2635                                              dentry->d_inode, NULL, 0, 0,
2636                                              LUSTRE_OPC_ANY, NULL);
2637                 if (IS_ERR(op_data))
2638                         RETURN(PTR_ERR(op_data));
2640                 oit.it_create_mode |= M_CHECK_STALE;
2641                 rc = md_intent_lock(exp, op_data, NULL, 0,
2642                                     /* we are not interested in name
2645                                     ll_md_blocking_ast, 0);
2646                 ll_finish_md_op_data(op_data);
2647                 oit.it_create_mode &= ~M_CHECK_STALE;
2649                         rc = ll_inode_revalidate_fini(inode, rc);
2653                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2655                         ll_intent_release(&oit);
2659                 /* Unlinked? Unhash dentry, so it is not picked up later by
2660                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2661                    here to preserve get_cwd functionality on 2.6.
2663                 if (!dentry->d_inode->i_nlink)
2664                         d_lustre_invalidate(dentry);
2666                 ll_lookup_finish_locks(&oit, dentry);
/* no ATTRFID support: plain getattr, but only if no cached md lock */
2667         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2668                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2669                 obd_valid valid = OBD_MD_FLGETATTR;
2670                 struct md_op_data *op_data;
/* regular files also need their EA (layout) size in the reply */
2673                 if (S_ISREG(inode->i_mode)) {
2674                         rc = ll_get_max_mdsize(sbi, &ealen);
2677                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2680                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2681                                              0, ealen, LUSTRE_OPC_ANY,
2683                 if (IS_ERR(op_data))
2684                         RETURN(PTR_ERR(op_data));
2686                 op_data->op_valid = valid;
2687                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2688                  * capa for this inode. Because we only keep capas of dirs
2690                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2691                 ll_finish_md_op_data(op_data);
2693                         rc = ll_inode_revalidate_fini(inode, rc);
2697                 rc = ll_prep_inode(&inode, req, NULL, NULL);
2700         ptlrpc_req_finished(req);
/*
 * Revalidate attributes, then bring the size/time attributes up to date:
 * non-regular objects copy atime/mtime/ctime from the cached LVB, while
 * (presumably, on the elided else path) regular files glimpse their size
 * from the OSTs.  NOTE(review): error-handling lines are elided here.
 */
2704 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2707 struct inode *inode = dentry->d_inode;
2711 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2715 /* if object isn't regular file, don't validate size */
2716 if (!S_ISREG(inode->i_mode)) {
2717 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2718 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2719 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2721 rc = ll_glimpse_size(inode);
/*
 * ->getattr() worker: revalidate UPDATE|LOOKUP ibits against the MDS,
 * then populate *stat from the (now fresh) in-core inode.
 *
 * \param mnt   vfsmount of the caller (unused in the visible body)
 * \param de    dentry being stat'ed
 * \param it    lookup intent forwarded to revalidation
 * \param stat  output kstat, filled on success
 */
2726 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2727 struct lookup_intent *it, struct kstat *stat)
2729 struct inode *inode = de->d_inode;
2730 struct ll_sb_info *sbi = ll_i2sbi(inode);
2731 struct ll_inode_info *lli = ll_i2info(inode);
2734 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2735 MDS_INODELOCK_LOOKUP);
2736 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2741 stat->dev = inode->i_sb->s_dev;
2742 /* 32-bit API callers get a squashed inode number built from the FID. */
2742 if (ll_need_32bit_api(sbi))
2743 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2745 stat->ino = inode->i_ino;
2746 stat->mode = inode->i_mode;
2747 stat->nlink = inode->i_nlink;
2748 stat->uid = inode->i_uid;
2749 stat->gid = inode->i_gid;
2750 stat->rdev = inode->i_rdev;
2751 stat->atime = inode->i_atime;
2752 stat->mtime = inode->i_mtime;
2753 stat->ctime = inode->i_ctime;
2754 stat->blksize = 1 << inode->i_blkbits;
2756 stat->size = i_size_read(inode);
2757 stat->blocks = inode->i_blocks;
/* VFS ->getattr entry point: wrap ll_getattr_it() with an IT_GETATTR
 * intent. */
2761 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2763 struct lookup_intent it = { .it_op = IT_GETATTR };
2765 return ll_getattr_it(mnt, de, &it, stat);
2768 #ifdef HAVE_LINUX_FIEMAP_H
2769 /*
2769  * VFS ->fiemap handler: marshal the kernel's fiemap_extent_info into a
2769  * Lustre ll_user_fiemap, run the OST mapping via ll_do_fiemap(), and
2769  * copy the mapped extents back to userspace's extent array.
2769  * NOTE(review): allocation-failure and rc checks are elided in this
2769  * excerpt.
2769  */
2769 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2770 __u64 start, __u64 len)
2774 struct ll_user_fiemap *fiemap;
2775 unsigned int extent_count = fieinfo->fi_extents_max;
2777 num_bytes = sizeof(*fiemap) + (extent_count *
2778 sizeof(struct ll_fiemap_extent));
2779 OBD_ALLOC_LARGE(fiemap, num_bytes);
2784 fiemap->fm_flags = fieinfo->fi_flags;
2785 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2786 fiemap->fm_start = start;
2787 fiemap->fm_length = len;
2788 /* Only the first extent is copied in: presumably it seeds a resumed
2788  * mapping (fm_extents[0] as the continuation hint) — TODO confirm. */
2788 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2789 sizeof(struct ll_fiemap_extent));
2791 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2793 fieinfo->fi_flags = fiemap->fm_flags;
2794 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2795 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2796 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2798 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * The cache (lli_posix_acl) is read under lli_lock; the caller releases
 * the reference taken by posix_acl_dup().
 */
2803 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2805 struct ll_inode_info *lli = ll_i2info(inode);
2806 struct posix_acl *acl = NULL;
2809 spin_lock(&lli->lli_lock);
2810 /* VFS' acl_permission_check->check_acl will release the refcount */
2811 acl = posix_acl_dup(lli->lli_posix_acl);
2812 spin_unlock(&lli->lli_lock);
2817 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2819 /*
2819  * check_acl callback for ll_generic_permission(): evaluate the cached
2819  * POSIX ACL against `mask`.  Signature varies with kernel version
2819  * (4-arg generic_permission passes IPERM flags).
2819  * NOTE(review): return statements and #else/#endif lines are elided.
2819  */
2819 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2820 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2822 ll_check_acl(struct inode *inode, int mask)
2825 # ifdef CONFIG_FS_POSIX_ACL
2826 struct posix_acl *acl;
2830 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2831 /* RCU walk cannot block on ACL retrieval. */
2831 if (flags & IPERM_FLAG_RCU)
2834 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2839 rc = posix_acl_permission(inode, acl, mask);
2840 posix_acl_release(acl);
2843 # else /* !CONFIG_FS_POSIX_ACL */
2845 # endif /* CONFIG_FS_POSIX_ACL */
2847 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
2849 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2850 /*
2850  * VFS ->permission: revalidate the root inode if needed, honor remote
2850  * (RMT_CLIENT) permission checking, then fall through to the generic
2850  * permission helper with our ACL callback.  Signature depends on kernel
2850  * version.  NOTE(review): several lines are elided in this excerpt.
2850  */
2850 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2852 # ifdef HAVE_INODE_PERMISION_2ARGS
2853 int ll_inode_permission(struct inode *inode, int mask)
2855 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2862 #ifdef MAY_NOT_BLOCK
2863 /* RCU-walk lookup: we may need to block, so ask VFS to retry. */
2863 if (mask & MAY_NOT_BLOCK)
2865 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2866 if (flags & IPERM_FLAG_RCU)
2870 /* as root inode are NOT getting validated in lookup operation,
2871 * need to do it before permission check. */
2873 if (inode == inode->i_sb->s_root->d_inode) {
2874 struct lookup_intent it = { .it_op = IT_LOOKUP };
2876 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2877 MDS_INODELOCK_LOOKUP);
2882 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2883 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2885 /* Remote-client setups check permission against the MDT. */
2885 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2886 return lustre_check_remote_perm(inode, mask);
2888 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2889 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
2894 /* Map kernel-version-specific vectored I/O file_operations members:
2894  * old kernels use ->readv/->writev, newer ones ->aio_read/->aio_write.
2894  * NOTE(review): the #else line between the two groups is elided. */
2894 #ifdef HAVE_FILE_READV
2895 #define READ_METHOD readv
2896 #define READ_FUNCTION ll_file_readv
2897 #define WRITE_METHOD writev
2898 #define WRITE_FUNCTION ll_file_writev
2900 #define READ_METHOD aio_read
2901 #define READ_FUNCTION ll_file_aio_read
2902 #define WRITE_METHOD aio_write
2903 #define WRITE_FUNCTION ll_file_aio_write
2906 /* -o localflock - only provides locally consistent flock locks */
2907 struct file_operations ll_file_operations = {
2908 .read = ll_file_read,
2909 .READ_METHOD = READ_FUNCTION,
2910 .write = ll_file_write,
2911 .WRITE_METHOD = WRITE_FUNCTION,
2912 .unlocked_ioctl = ll_file_ioctl,
2913 .open = ll_file_open,
2914 .release = ll_file_release,
2915 .mmap = ll_file_mmap,
2916 .llseek = ll_file_seek,
2917 #ifdef HAVE_KERNEL_SENDFILE
2918 .sendfile = ll_file_sendfile,
2920 #ifdef HAVE_KERNEL_SPLICE_READ
2921 .splice_read = ll_file_splice_read,
2927 /* Default file_operations table: same as ll_file_operations but with
2927  * cluster-coherent ->flock/->lock handlers (ll_file_flock). */
2927 struct file_operations ll_file_operations_flock = {
2928 .read = ll_file_read,
2929 .READ_METHOD = READ_FUNCTION,
2930 .write = ll_file_write,
2931 .WRITE_METHOD = WRITE_FUNCTION,
2932 .unlocked_ioctl = ll_file_ioctl,
2933 .open = ll_file_open,
2934 .release = ll_file_release,
2935 .mmap = ll_file_mmap,
2936 .llseek = ll_file_seek,
2937 #ifdef HAVE_KERNEL_SENDFILE
2938 .sendfile = ll_file_sendfile,
2940 #ifdef HAVE_KERNEL_SPLICE_READ
2941 .splice_read = ll_file_splice_read,
2945 .flock = ll_file_flock,
2946 .lock = ll_file_flock
2949 /* These are for -o noflock - to return ENOSYS on flock calls */
2950 struct file_operations ll_file_operations_noflock = {
2951 .read = ll_file_read,
2952 .READ_METHOD = READ_FUNCTION,
2953 .write = ll_file_write,
2954 .WRITE_METHOD = WRITE_FUNCTION,
2955 .unlocked_ioctl = ll_file_ioctl,
2956 .open = ll_file_open,
2957 .release = ll_file_release,
2958 .mmap = ll_file_mmap,
2959 .llseek = ll_file_seek,
2960 #ifdef HAVE_KERNEL_SENDFILE
2961 .sendfile = ll_file_sendfile,
2963 #ifdef HAVE_KERNEL_SPLICE_READ
2964 .splice_read = ll_file_splice_read,
2968 .flock = ll_file_noflock,
2969 .lock = ll_file_noflock
2972 /* inode_operations for regular files: attribute, xattr, ACL and fiemap
2972  * entry points (conditional members gated on kernel features). */
2972 struct inode_operations ll_file_inode_operations = {
2973 .setattr = ll_setattr,
2974 .getattr = ll_getattr,
2975 .permission = ll_inode_permission,
2976 .setxattr = ll_setxattr,
2977 .getxattr = ll_getxattr,
2978 .listxattr = ll_listxattr,
2979 .removexattr = ll_removexattr,
2980 #ifdef HAVE_LINUX_FIEMAP_H
2981 .fiemap = ll_fiemap,
2983 #ifdef HAVE_IOP_GET_ACL
2984 .get_acl = ll_get_acl,
2988 /* dynamic ioctl number support routins */
2989 /* Global registry of dynamically registered ioctl handlers, protected by
2989  * an rwsem: readers iterate in ll_iocontrol_call(), writers register/
2989  * unregister blocks. */
2989 static struct llioc_ctl_data {
2990 struct rw_semaphore ioc_sem;
2991 cfs_list_t ioc_head;
2993 __RWSEM_INITIALIZER(llioc.ioc_sem),
2994 CFS_LIST_HEAD_INIT(llioc.ioc_head)
2999 /* One registered handler: callback plus the array of ioctl cmd numbers
2999  * it serves (flexible trailing array iocd_cmd). */
2999 cfs_list_t iocd_list;
3000 unsigned int iocd_size;
3001 llioc_callback_t iocd_cb;
3002 unsigned int iocd_count;
3003 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler.
 *
 * \param cb     callback invoked when one of `cmd` is seen
 * \param count  number of entries in `cmd` (bounded by LLIOC_MAX_CMD)
 * \param cmd    array of ioctl command numbers to claim
 * \return opaque cookie for ll_iocontrol_unregister() — presumably the
 *         in_data pointer itself; the return line is elided here.
 */
3006 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3009 struct llioc_data *in_data = NULL;
3012 if (cb == NULL || cmd == NULL ||
3013 count > LLIOC_MAX_CMD || count < 0)
3016 size = sizeof(*in_data) + count * sizeof(unsigned int);
3017 OBD_ALLOC(in_data, size);
3018 if (in_data == NULL)
3021 memset(in_data, 0, sizeof(*in_data));
3022 in_data->iocd_size = size;
3023 in_data->iocd_cb = cb;
3024 in_data->iocd_count = count;
3025 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3027 down_write(&llioc.ioc_sem);
3028 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3029 up_write(&llioc.ioc_sem);
/*
 * Remove and free a handler previously returned by
 * ll_iocontrol_register().  Warns if `magic` is not found in the list.
 * NOTE(review): the comparison of list entries against `magic` and the
 * return after freeing are elided in this excerpt.
 */
3034 void ll_iocontrol_unregister(void *magic)
3036 struct llioc_data *tmp;
3041 down_write(&llioc.ioc_sem);
3042 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3044 unsigned int size = tmp->iocd_size;
3046 cfs_list_del(&tmp->iocd_list);
3047 up_write(&llioc.ioc_sem);
3049 OBD_FREE(tmp, size);
3053 up_write(&llioc.ioc_sem);
3055 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3058 EXPORT_SYMBOL(ll_iocontrol_register);
3059 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the registered dynamic handlers.
 * Walks the registry under a read lock; the first handler claiming `cmd`
 * runs its callback, and iteration stops if it returns LLIOC_STOP.
 * *rcp receives the handler's result (default -EINVAL).
 */
3061 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3062 unsigned int cmd, unsigned long arg, int *rcp)
3064 enum llioc_iter ret = LLIOC_CONT;
3065 struct llioc_data *data;
3066 int rc = -EINVAL, i;
3068 down_read(&llioc.ioc_sem);
3069 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3070 for (i = 0; i < data->iocd_count; i++) {
3071 if (cmd != data->iocd_cmd[i])
3074 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3078 if (ret == LLIOC_STOP)
3081 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for this inode.
 * For OBJECT_CONF_SET, once the layout is applied the layout lock is
 * allowed to match (ldlm_lock_allow_match) so other threads never see a
 * matchable lock with an unapplied layout.
 */
3088 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3090 struct ll_inode_info *lli = ll_i2info(inode);
3091 struct cl_env_nest nest;
3096 if (lli->lli_clob == NULL)
3099 env = cl_env_nested_get(&nest);
3101 RETURN(PTR_ERR(env));
3103 result = cl_conf_set(env, lli->lli_clob, conf);
3104 cl_env_nested_put(&nest, env);
3106 if (conf->coc_opc == OBJECT_CONF_SET) {
3107 struct ldlm_lock *lock = conf->coc_lock;
3109 LASSERT(lock != NULL);
3110 LASSERT(ldlm_has_layout(lock));
3112 /* it can only be allowed to match after layout is
3113 * applied to inode otherwise false layout would be
3114 * seen. Applying layout shoud happen before dropping
3115 * the intent lock. */
3116 ldlm_lock_allow_match(lock);
3123 * Apply the layout to the inode. Layout lock is held and will be released
3126 /*
3126  * \param lockh   held layout lock handle (decref'ed on all paths)
3126  * \param mode    ldlm mode of that lock
3126  * \param inode   target inode
3126  * \param gen     out: layout generation after (re)configuration
3126  * \param reconf  true when re-applying over an existing layout
3126  * NOTE(review): many branch/return lines are elided in this excerpt.
3126  */
3126 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3127 struct inode *inode, __u32 *gen, bool reconf)
3129 struct ll_inode_info *lli = ll_i2info(inode);
3130 struct ll_sb_info *sbi = ll_i2sbi(inode);
3131 struct ldlm_lock *lock;
3132 struct lustre_md md = { NULL };
3133 struct cl_object_conf conf;
3138 LASSERT(lustre_handle_is_used(lockh));
3140 lock = ldlm_handle2lock(lockh);
3141 LASSERT(lock != NULL);
3142 LASSERT(ldlm_has_layout(lock));
3144 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3145 inode, PFID(&lli->lli_fid), reconf);
3147 lock_res_and_lock(lock);
3148 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3149 unlock_res_and_lock(lock);
3150 /* checking lvb_ready is racy but this is okay. The worst case is
3151 * that multi processes may configure the file on the same time. */
3152 if (lvb_ready || !reconf) {
3153 LDLM_LOCK_PUT(lock);
3157 /* layout_gen must be valid if layout lock is not
3158 * cancelled and stripe has already set */
3159 *gen = lli->lli_layout_gen;
3162 ldlm_lock_decref(lockh, mode);
3166 /* for layout lock, lmm is returned in lock's lvb.
3167 * lvb_data is immutable if the lock is held so it's safe to access it
3168 * without res lock. See the description in ldlm_lock_decref_internal()
3169 * for the condition to free lvb_data of layout lock */
3170 if (lock->l_lvb_data != NULL) {
3171 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3172 lock->l_lvb_data, lock->l_lvb_len);
3174 /* No striping (rc path elided): empty-generation layout. */
3174 *gen = LL_LAYOUT_GEN_EMPTY;
3176 *gen = md.lsm->lsm_layout_gen;
3179 CERROR("%s: file "DFID" unpackmd error: %d\n",
3180 ll_get_fsname(inode->i_sb, NULL, 0),
3181 PFID(&lli->lli_fid), rc);
3185 LDLM_LOCK_PUT(lock);
3186 ldlm_lock_decref(lockh, mode);
3190 /* set layout to file. Unlikely this will fail as old layout was
3191 * surely eliminated */
3192 memset(&conf, 0, sizeof conf);
3193 conf.coc_opc = OBJECT_CONF_SET;
3194 conf.coc_inode = inode;
3195 conf.coc_lock = lock;
3196 conf.u.coc_md = &md;
3197 rc = ll_layout_conf(inode, &conf);
3198 LDLM_LOCK_PUT(lock);
3200 ldlm_lock_decref(lockh, mode);
3203 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3205 /* wait for IO to complete if it's still being used. */
3207 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3208 ll_get_fsname(inode->i_sb, NULL, 0),
3209 inode, PFID(&lli->lli_fid));
3211 memset(&conf, 0, sizeof conf);
3212 conf.coc_opc = OBJECT_CONF_WAIT;
3213 conf.coc_inode = inode;
3214 rc = ll_layout_conf(inode, &conf);
3218 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3219 PFID(&lli->lli_fid), rc);
3226 * This function checks if there exists a LAYOUT lock on the client side,
3227 * or enqueues it if it doesn't have one in cache.
3229 * This function will not hold layout lock so it may be revoked any time after
3230 * this function returns. Any operations depend on layout should be redone
3233 * This function should be called before lov_io_init() to get an uptodate
3234 * layout version, the caller should save the version number and after IO
3235 * is finished, this function should be called again to verify that layout
3236 * is not changed during IO time.
3238 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3240 struct ll_inode_info *lli = ll_i2info(inode);
3241 struct ll_sb_info *sbi = ll_i2sbi(inode);
3242 struct md_op_data *op_data;
3243 struct lookup_intent it;
3244 struct lustre_handle lockh;
3246 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3248 .ei_cb_bl = ll_md_blocking_ast,
3249 .ei_cb_cp = ldlm_completion_ast,
3250 .ei_cbdata = inode };
3254 *gen = LL_LAYOUT_GEN_NONE;
3255 /* Nothing to do when the server doesn't support layout locks. */
3255 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3259 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3260 LASSERT(S_ISREG(inode->i_mode));
3262 /* mostly layout lock is caching on the local side, so try to match
3263 * it before grabbing layout lock mutex. */
3264 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3265 if (mode != 0) { /* hit cached lock */
3266 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3270 /* better hold lli_layout_mutex to try again otherwise
3271 * it will have starvation problem. */
3274 /* take layout lock mutex to enqueue layout lock exclusively. */
3275 mutex_lock(&lli->lli_layout_mutex);
3278 /* try again. Maybe somebody else has done this. */
3279 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3280 if (mode != 0) { /* hit cached lock */
3281 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3285 mutex_unlock(&lli->lli_layout_mutex);
3289 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3290 0, 0, LUSTRE_OPC_ANY, NULL);
3291 if (IS_ERR(op_data)) {
3292 mutex_unlock(&lli->lli_layout_mutex);
3293 RETURN(PTR_ERR(op_data));
3296 /* have to enqueue one */
3297 memset(&it, 0, sizeof(it));
3298 it.it_op = IT_LAYOUT;
3299 lockh.cookie = 0ULL;
3301 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3302 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3303 PFID(&lli->lli_fid));
3305 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3307 /* The RPC reply is not needed once the lock handle is recorded. */
3307 if (it.d.lustre.it_data != NULL)
3308 ptlrpc_req_finished(it.d.lustre.it_data);
3309 it.d.lustre.it_data = NULL;
3311 ll_finish_md_op_data(op_data);
3313 /* Transfer lock ownership out of the intent before dropping it. */
3313 mode = it.d.lustre.it_lock_mode;
3314 it.d.lustre.it_lock_mode = 0;
3315 ll_intent_drop_lock(&it);
3318 /* set lock data in case this is a new lock */
3319 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3320 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3324 mutex_unlock(&lli->lli_layout_mutex);