4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
58 fd->fd_write_failed = false;
/*
 * Give @fd back to ll_file_data_slab with OBD_SLAB_FREE_PTR().
 * NOTE(review): the embedded numbering jumps from 62 to 65, so any guard
 * that precedes the free is not visible in this listing.
 */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the state of @inode into @op_data: FID, mode, atime/mtime/ctime,
 * size (read with i_size_read()), block count, the ext flags converted from
 * i_flags, the inode's current IO epoch, the open handle *@fh and the value
 * returned by ll_mdscapa_get() (stored in op_capa1).
 * NOTE(review): original line 81, right before the *fh copy, is absent from
 * this listing, so whether @fh is checked before use cannot be seen here.
 */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
/* op_attr is accessed through a struct ll_iattr cast to reach ia_attr_flags. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
87 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * Fill @op_data for closing open handle @och on @inode.
 * The attribute mask always carries ATTR_MODE and the atime/mtime/ctime
 * "SET" bits.  On the path that reaches the Size-on-MDS test,
 * ATTR_SIZE | ATTR_BLOCKS are added when the MD export lacks SOM support or
 * the inode is not a regular file.  The inode state and the handle are
 * packed with ll_pack_inode2opdata() followed by ll_prep_md_op_data().
 * NOTE(review): original lines 99-100, 103 and 105-106 are missing from this
 * listing, so the branch structure around the FMODE_WRITE test and the
 * ll_ioepoch_close() call is only partly visible.
 */
90 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
91 struct obd_client_handle *och)
95 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
96 ATTR_MTIME_SET | ATTR_CTIME_SET;
98 if (!(och->och_flags & FMODE_WRITE))
101 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
102 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Note that the address of the local @och pointer is what gets passed. */
104 ll_ioepoch_close(inode, op_data, &och, 0);
107 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
108 ll_prep_md_op_data(op_data, inode, NULL, NULL,
109 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close for open handle @och of @inode through @md_exp.
 *
 * Visible steps: allocate an md_op_data, fill it with ll_prepare_close(),
 * issue md_close(); on one of the result paths refresh Size-on-MDS with
 * ll_som_update(); release op_data; call ll_objects_destroy() on the reply;
 * then either queue DONE_WRITING or clear the open replay data and poison
 * the handle cookie; finally drop the close request if one was returned.
 * NOTE(review): many short original lines (conditions, else branches,
 * labels, the return) are missing from this listing, so the exact branch
 * structure must be confirmed against the full source.
 */
113 static int ll_close_inode_openhandle(struct obd_export *md_exp,
115 struct obd_client_handle *och)
117 struct obd_export *exp = ll_i2mdexp(inode);
118 struct md_op_data *op_data;
119 struct ptlrpc_request *req = NULL;
120 struct obd_device *obd = class_exp2obd(exp);
127 * XXX: in case of LMV, is this correct to access
130 CERROR("Invalid MDC connection handle "LPX64"\n",
131 ll_i2mdexp(inode)->exp_handle.h_cookie);
135 OBD_ALLOC_PTR(op_data);
137 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139 ll_prepare_close(inode, op_data, och);
/* Remember whether this close request also closes the IO epoch. */
140 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141 rc = md_close(md_exp, op_data, och->och_mod, &req);
143 /* This close must have the epoch closed. */
144 LASSERT(epoch_close);
145 /* MDS has instructed us to obtain Size-on-MDS attribute from
146 * OSTs and send setattr back to MDS. */
147 rc = ll_som_update(inode, op_data);
149 CERROR("inode %lu mdc Size-on-MDS update failed: "
150 "rc = %d\n", inode->i_ino, rc);
154 CERROR("inode %lu mdc close failed: rc = %d\n",
157 ll_finish_md_op_data(op_data);
160 rc = ll_objects_destroy(req, inode);
162 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* DONE_WRITING is queued only for a regular file opened for write on a
 * SOM-capable export when this close did not close the epoch. */
169 if (exp_connect_som(exp) && !epoch_close &&
170 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
171 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173 md_clear_open_replay_data(md_exp, och);
174 /* Free @och if it is not waiting for DONE_WRITING. */
175 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
178 if (req) /* This is close request */
179 ptlrpc_req_finished(req);
/*
 * Close the cached MDS open handle of @inode that corresponds to the open
 * mode in @flags.
 *
 * The handle slot and its use counter are chosen by mode.  Under
 * lli_och_mutex the use counter is tested; when the handle is still in use
 * the mutex is dropped without closing.  Otherwise a handle that is still
 * present is closed through ll_close_inode_openhandle() on the superblock's
 * MD export.
 * NOTE(review): the lines that detach the handle from the slot and the
 * return statements are missing from this listing.
 */
183 int ll_md_real_close(struct inode *inode, int flags)
185 struct ll_inode_info *lli = ll_i2info(inode);
186 struct obd_client_handle **och_p;
187 struct obd_client_handle *och;
/* Write takes precedence over exec; read is the asserted fallback. */
192 if (flags & FMODE_WRITE) {
193 och_p = &lli->lli_mds_write_och;
194 och_usecount = &lli->lli_open_fd_write_count;
195 } else if (flags & FMODE_EXEC) {
196 och_p = &lli->lli_mds_exec_och;
197 och_usecount = &lli->lli_open_fd_exec_count;
199 LASSERT(flags & FMODE_READ);
200 och_p = &lli->lli_mds_read_och;
201 och_usecount = &lli->lli_open_fd_read_count;
204 cfs_mutex_lock(&lli->lli_och_mutex);
205 if (*och_usecount) { /* There are still users of this handle, so
207 cfs_mutex_unlock(&lli->lli_och_mutex);
212 cfs_mutex_unlock(&lli->lli_och_mutex);
214 if (och) { /* There might be a race and somebody have freed this och
216 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file part of close for @file on @inode.
 *
 * Drops a group lock still held by this file descriptor, decrements the
 * per-mode open counter under lli_och_mutex, and calls ll_md_real_close()
 * when md_lock_match() finds no matching MDS_INODELOCK_OPEN ibits lock.
 * Afterwards the private file data is detached from @file and freed, and
 * ll_capa_close() is called for the inode.
 * NOTE(review): some original lines (e.g. the lock-mode selection and the
 * return) are missing from this listing.
 */
223 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
226 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
227 struct ll_inode_info *lli = ll_i2info(inode);
231 /* clear group lock, if present */
232 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
233 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235 /* Let's see if we have good enough OPEN lock on the file and if
236 we can skip talking to MDS */
237 if (file->f_dentry->d_inode) { /* Can this ever be false? */
239 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
240 struct lustre_handle lockh;
/* NOTE(review): this local shadows the @inode parameter inside the branch. */
241 struct inode *inode = file->f_dentry->d_inode;
242 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this descriptor's reference on the per-mode open counter. */
244 cfs_mutex_lock(&lli->lli_och_mutex);
245 if (fd->fd_omode & FMODE_WRITE) {
247 LASSERT(lli->lli_open_fd_write_count);
248 lli->lli_open_fd_write_count--;
249 } else if (fd->fd_omode & FMODE_EXEC) {
251 LASSERT(lli->lli_open_fd_exec_count);
252 lli->lli_open_fd_exec_count--;
255 LASSERT(lli->lli_open_fd_read_count);
256 lli->lli_open_fd_read_count--;
258 cfs_mutex_unlock(&lli->lli_och_mutex);
260 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
261 LDLM_IBITS, &policy, lockmode,
263 rc = ll_md_real_close(file->f_dentry->d_inode,
/* NOTE(review): this message has no trailing newline, unlike the other
 * CERROR calls in this file. */
267 CERROR("Releasing a file %p with negative dentry %p. Name %s",
268 file, file->f_dentry, file->f_dentry->d_name.name);
271 LUSTRE_FPRIVATE(file) = NULL;
272 ll_file_data_put(fd);
273 ll_capa_close(inode);
278 /* While this returns an error code, fput() the caller does not, so we need
279 * to make every effort to clean up all of our state here. Also, applications
280 * rarely check close errors and even if an error is returned they will not
281 * re-try the close call.
/*
 * Release handler for @file on @inode.
 *
 * Visible steps: trace the call; with CONFIG_FS_POSIX_ACL, for a remote
 * client releasing the root inode, clear LL_FILE_RMTACL and remove the
 * current pid's entries from ll_rct / ll_et; count the release in the stats
 * unless @file is the root dentry; stop statahead when this descriptor is
 * the directory's opendir key; for the root dentry only free the private
 * data; for non-directories reset the async error state; finally close via
 * ll_md_close() and optionally dump the debug log under fail injection.
 * NOTE(review): short lines such as assertions and returns are missing from
 * this listing.
 */
283 int ll_file_release(struct inode *inode, struct file *file)
285 struct ll_file_data *fd;
286 struct ll_sb_info *sbi = ll_i2sbi(inode);
287 struct ll_inode_info *lli = ll_i2info(inode);
291 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
292 inode->i_generation, inode);
294 #ifdef CONFIG_FS_POSIX_ACL
295 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
296 inode == inode->i_sb->s_root->d_inode) {
/* NOTE(review): this declaration shadows the function-level @fd. */
297 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
300 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
301 fd->fd_flags &= ~LL_FILE_RMTACL;
302 rct_del(&sbi->ll_rct, cfs_curproc_pid());
303 et_search_free(&sbi->ll_et, cfs_curproc_pid());
308 if (inode->i_sb->s_root != file->f_dentry)
309 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
310 fd = LUSTRE_FPRIVATE(file);
313 /* The last ref on @file, maybe not the owner pid of statahead.
314 * Different processes can open the same dir, "ll_opendir_key" means:
315 * it is me that should stop the statahead thread. */
316 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
317 lli->lli_opendir_pid != 0)
318 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root dentry only needs its private data released. */
320 if (inode->i_sb->s_root == file->f_dentry) {
321 LUSTRE_FPRIVATE(file) = NULL;
322 ll_file_data_put(fd);
/* For non-directories, read-and-clear the object's async rc (the result is
 * ignored here) and reset lli_async_rc. */
326 if (!S_ISDIR(inode->i_mode)) {
327 lov_read_and_clear_async_rc(lli->lli_clob);
328 lli->lli_async_rc = 0;
331 rc = ll_md_close(sbi->ll_md_exp, inode, file);
333 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
334 libcfs_debug_dumplog();
/*
 * Open @file on the MDS by FID using intent @itp.
 *
 * When no striping data is passed (@lmm == NULL and @lmmsize == 0) the open
 * lock is requested, and LUSTRE_OPC_CREATE is used for write opens.  The
 * request is sent with md_intent_lock(); the visible tail handles a negative
 * lookup (-ENOENT), an enqueue/open error, refreshes the inode from the
 * reply with ll_prep_inode(), attaches lock data when a lock mode was
 * granted, and finally drops the request reference and the intent lock.
 * NOTE(review): several short lines (conditions, labels, returns) are
 * missing from this listing.
 */
339 static int ll_intent_file_open(struct file *file, void *lmm,
340 int lmmsize, struct lookup_intent *itp)
342 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
343 struct dentry *parent = file->f_dentry->d_parent;
344 const char *name = file->f_dentry->d_name.name;
345 const int len = file->f_dentry->d_name.len;
346 struct md_op_data *op_data;
347 struct ptlrpc_request *req;
348 __u32 opc = LUSTRE_OPC_ANY;
355 /* Usually we come here only for NFSD, and we want open lock.
356 But we can also get here with pre 2.6.15 patchless kernels, and in
357 that case that lock is also ok */
358 /* We can also get here if there was cached open handle in revalidate_it
359 * but it disappeared while we were getting from there to ll_file_open.
360 * But this means this file was closed and immediately opened which
361 * makes a good candidate for using OPEN lock */
362 /* If lmmsize & lmm are not 0, we are just setting stripe info
363 * parameters. No need for the open lock */
364 if (lmm == NULL && lmmsize == 0) {
365 itp->it_flags |= MDS_OPEN_LOCK;
366 if (itp->it_flags & FMODE_WRITE)
367 opc = LUSTRE_OPC_CREATE;
/* Build op_data from the parent directory, the child inode and its name. */
370 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
371 file->f_dentry->d_inode, name, len,
374 RETURN(PTR_ERR(op_data));
376 itp->it_flags |= MDS_OPEN_BY_FID;
377 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
378 0 /*unused */, &req, ll_md_blocking_ast, 0);
379 ll_finish_md_op_data(op_data);
381 /* reason for keep own exit path - don`t flood log
382 * with messages with -ESTALE errors.
384 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
385 it_open_error(DISP_OPEN_OPEN, itp))
387 ll_release_openhandle(file->f_dentry, itp);
391 if (it_disposition(itp, DISP_LOOKUP_NEG))
392 GOTO(out, rc = -ENOENT);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
400 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
401 if (!rc && itp->d.lustre.it_lock_mode)
402 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
406 ptlrpc_req_finished(itp->d.lustre.it_data);
407 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
408 ll_intent_drop_lock(itp);
414 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
415 * not believe attributes if a few ioepoch holders exist. Attributes for
416 * previous ioepoch if new one is opened are also skipped by MDS.
418 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
420 if (ioepoch && lli->lli_ioepoch != ioepoch) {
421 lli->lli_ioepoch = ioepoch;
422 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
423 ioepoch, PFID(&lli->lli_fid));
/*
 * Initialise open handle @och from the MDT_BODY of the open reply stored in
 * it->d.lustre.it_data: copy the file handle, set the magic, the inode FID
 * and the intent flags, and record the reply's IO epoch in @lli.
 *
 * Returns the result of md_set_open_replay_data() for the request.
 */
427 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
428 struct lookup_intent *it, struct obd_client_handle *och)
430 struct ptlrpc_request *req = it->d.lustre.it_data;
431 struct mdt_body *body;
435 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
436 LASSERT(body != NULL); /* reply already checked out */
438 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
439 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
440 och->och_fid = lli->lli_fid;
441 och->och_flags = it->it_flags;
442 ll_ioepoch_open(lli, body->ioepoch);
444 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish an open on the client side: attach @fd to @file as its private
 * data, initialise the readahead state and remember the open mode taken from
 * it->it_flags.
 *
 * On the path that fills a handle, @och is initialised from the open reply
 * with ll_och_fill() and, for a write open whose reply carries
 * OBD_MD_FLSIZE, the opened epoch is logged.
 * NOTE(review): the condition selecting that path (presumably @och being
 * non-NULL - confirm) and the error/return lines are missing from this
 * listing.  @file must not have private data yet (asserted below).
 */
447 int ll_local_open(struct file *file, struct lookup_intent *it,
448 struct ll_file_data *fd, struct obd_client_handle *och)
450 struct inode *inode = file->f_dentry->d_inode;
451 struct ll_inode_info *lli = ll_i2info(inode);
454 LASSERT(!LUSTRE_FPRIVATE(file));
459 struct ptlrpc_request *req = it->d.lustre.it_data;
460 struct mdt_body *body;
463 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
467 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
468 if ((it->it_flags & FMODE_WRITE) &&
469 (body->valid & OBD_MD_FLSIZE))
470 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
471 lli->lli_ioepoch, PFID(&lli->lli_fid));
474 LUSTRE_FPRIVATE(file) = fd;
475 ll_readahead_init(inode, &fd->fd_ras);
476 fd->fd_omode = it->it_flags;
480 /* Open a file, and (for the very first open) create objects on the OSTs at
481 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
482 * creation or open until ll_lov_setstripe() ioctl is called.
484 * If we already have the stripe MD locally then we don't request it in
485 * md_open(), by passing a lmm_size = 0.
487 * It is up to the application to ensure no other processes open this file
488 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
489 * used. We might be able to avoid races of that sort by getting lli_open_sem
490 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
491 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * Open handler for @file on @inode.
 *
 * Visible flow: take the lookup intent left in file->private_data (or build
 * a local IT_OPEN intent from f_flags when none is usable), allocate the
 * per-file data, pick the cached MDS open handle slot by access mode and,
 * under lli_och_mutex, either reuse the existing handle (releasing any extra
 * open carried by the intent) or allocate a new one, issuing
 * ll_intent_file_open() first when the intent carries no disposition.
 * ll_local_open() then binds the file data.  The tail frees the handle and
 * file data on the cleanup path, stops a statahead claimed here, and counts
 * the open in the stats.
 * The out_och_free target is also used by non-error exits, e.g. for
 * non-regular files.
 * NOTE(review): many short lines (labels, conditions, else branches,
 * returns) are missing from this listing; confirm exact control flow against
 * the full source.
 */
493 int ll_file_open(struct inode *inode, struct file *file)
495 struct ll_inode_info *lli = ll_i2info(inode);
496 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
497 .it_flags = file->f_flags };
498 struct obd_client_handle **och_p = NULL;
499 __u64 *och_usecount = NULL;
500 struct ll_file_data *fd;
501 int rc = 0, opendir_set = 0;
504 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
505 inode->i_generation, inode, file->f_flags);
507 it = file->private_data; /* XXX: compat macro */
508 file->private_data = NULL; /* prevent ll_local_open assertion */
510 fd = ll_file_data_get();
512 GOTO(out_och_free, rc = -ENOMEM);
/* For a directory, claim the statahead opendir key for this descriptor when
 * nobody holds it yet; the check and update run under lli_sa_lock. */
515 if (S_ISDIR(inode->i_mode)) {
516 cfs_spin_lock(&lli->lli_sa_lock);
517 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
518 lli->lli_opendir_pid == 0) {
519 lli->lli_opendir_key = fd;
520 lli->lli_opendir_pid = cfs_curproc_pid();
523 cfs_spin_unlock(&lli->lli_sa_lock);
/* Opening the filesystem root: the file data is attached directly (the rest
 * of this branch is not visible in this listing). */
526 if (inode->i_sb->s_root == file->f_dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 cfs_mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 cfs_mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_dentry, it);
/* Reuse of the cached handle: no new handle is passed to ll_local_open(). */
588 rc = ll_local_open(file, it, fd, NULL);
591 cfs_mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 cfs_mutex_unlock(&lli->lli_och_mutex);
603 it->it_create_mode |= M_CHECK_STALE;
604 rc = ll_intent_file_open(file, NULL, 0, it);
605 it->it_create_mode &= ~M_CHECK_STALE;
607 GOTO(out_openerr, rc);
/* Allocate the cached open handle for this access mode in place. */
611 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
613 GOTO(out_och_free, rc = -ENOMEM);
617 /* md_intent_lock() didn't get a request ref if there was an
618 * open error, so don't do cleanup on the request here
620 /* XXX (green): Should not we bail out on any error here, not
621 * just open error? */
622 rc = it_open_error(DISP_OPEN_OPEN, it);
624 GOTO(out_och_free, rc);
626 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
628 rc = ll_local_open(file, it, fd, *och_p);
630 GOTO(out_och_free, rc);
632 cfs_mutex_unlock(&lli->lli_och_mutex);
635 /* Must do this outside lli_och_mutex lock to prevent deadlock where
636 different kind of OPEN lock for this same inode gets cancelled
637 by ldlm_cancel_lru */
638 if (!S_ISREG(inode->i_mode))
639 GOTO(out_och_free, rc);
643 if (!lli->lli_has_smd) {
644 if (file->f_flags & O_LOV_DELAY_CREATE ||
645 !(file->f_mode & FMODE_WRITE)) {
646 CDEBUG(D_INODE, "object creation was delayed\n");
647 GOTO(out_och_free, rc);
650 file->f_flags &= ~O_LOV_DELAY_CREATE;
651 GOTO(out_och_free, rc);
/* Drop the open request reference still held by the intent, if any. */
654 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
655 ptlrpc_req_finished(it->d.lustre.it_data);
656 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Cleanup of the handle slot; the condition guarding this path is not
 * visible in this listing. */
660 if (och_p && *och_p) {
661 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
662 *och_p = NULL; /* OBD_FREE writes some magic there */
665 cfs_mutex_unlock(&lli->lli_och_mutex);
668 if (opendir_set != 0)
669 ll_stop_statahead(inode, lli->lli_opendir_key);
671 ll_file_data_put(fd);
673 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
679 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an asynchronous getattr for the objects of @lsm through @exp and
 * wait for it on a private request set.
 *
 * The request obdo is given the object id/seq from @lsm, S_IFREG as mode,
 * @ioepoch and a valid mask naming the attributes of interest.  After the
 * wait, o_valid is masked down to blocks, block size, atime/mtime/ctime,
 * size and data version.
 * NOTE(review): the lines that bind @obdo to oinfo.oi_oa, test @sync before
 * setting OBD_FL_SRVLOCK, and return the result are missing from this
 * listing.
 */
680 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
681 struct obd_capa *capa, struct obdo *obdo,
682 __u64 ioepoch, int sync)
684 struct ptlrpc_request_set *set;
685 struct obd_info oinfo = { { { 0 } } };
690 LASSERT(lsm != NULL);
694 oinfo.oi_oa->o_id = lsm->lsm_object_id;
695 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
696 oinfo.oi_oa->o_mode = S_IFREG;
697 oinfo.oi_oa->o_ioepoch = ioepoch;
698 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
699 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
700 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
701 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
702 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
703 OBD_MD_FLDATAVERSION;
704 oinfo.oi_capa = capa;
/* Request the server-side lock flag (the guarding condition, presumably
 * @sync, is on a line not shown here - confirm). */
706 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
707 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
710 set = ptlrpc_prep_set();
712 CERROR("can't allocate ptlrpc set\n");
715 rc = obd_getattr_async(exp, &oinfo, set);
717 rc = ptlrpc_set_wait(set);
718 ptlrpc_set_destroy(set);
/* Keep only the attribute bits callers are meant to consume. */
721 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
722 OBD_MD_FLATIME | OBD_MD_FLMTIME |
723 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
724 OBD_MD_FLDATAVERSION);
729 * Performs the getattr on the inode and updates its fields.
730 * If @sync != 0, perform the getattr under the server-side lock.
/*
 * Fetch the attributes of @inode's stripe objects into @obdo with
 * ll_lsm_getattr() on the data export, then refresh the inode from @obdo
 * using the bits left in obdo->o_valid and log the resulting size, blocks
 * and block size.  The lsm reference taken at the start is dropped before
 * leaving.
 * NOTE(review): the capability release, the success test around the
 * refresh and the return are on lines missing from this listing.
 */
732 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
733 __u64 ioepoch, int sync)
735 struct obd_capa *capa = ll_mdscapa_get(inode);
736 struct lov_stripe_md *lsm;
740 lsm = ccc_inode_lsm_get(inode);
741 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
742 capa, obdo, ioepoch, sync);
745 obdo_refresh_inode(inode, obdo, obdo->o_valid);
747 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
748 lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
749 (unsigned long long)inode->i_blocks,
750 (unsigned long)ll_inode_blksize(inode));
752 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the lock value block of @inode's stripes into the inode.
 *
 * Under the inode size lock, an lvb is initialised from the inode, seeded
 * with the timestamps cached in lli_lvb, and merged with obd_merge_lvb() on
 * the data export.  Size, block count and mtime/atime/ctime of the inode are
 * then updated from the merged lvb.
 * NOTE(review): the declaration of the local lvb and the return are on lines
 * missing from this listing.
 */
756 int ll_merge_lvb(struct inode *inode)
758 struct ll_inode_info *lli = ll_i2info(inode);
759 struct ll_sb_info *sbi = ll_i2sbi(inode);
760 struct lov_stripe_md *lsm;
766 lsm = ccc_inode_lsm_get(inode);
767 ll_inode_size_lock(inode);
768 inode_init_lvb(inode, &lvb);
770 /* merge timestamps the most recently obtained from mds with
771 timestamps obtained from osts */
772 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
773 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
774 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
776 rc = obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
777 cl_isize_write_nolock(inode, lvb.lvb_size);
779 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
780 PFID(&lli->lli_fid), lvb.lvb_size);
781 inode->i_blocks = lvb.lvb_blocks;
783 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
784 LTIME_S(inode->i_atime) = lvb.lvb_atime;
785 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
787 ll_inode_size_unlock(inode);
788 ccc_inode_lsm_put(inode, lsm);
/*
 * Query the attributes of @lsm through the data export of @sbi (no
 * capability, ioepoch 0, sync 0) and copy size, blocks and
 * mtime/atime/ctime from the resulting obdo into the caller's stat buffer.
 * NOTE(review): the rest of the parameter list, the success test and the
 * return are on lines missing from this listing.
 */
793 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
796 struct obdo obdo = { 0 };
799 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
801 st->st_size = obdo.o_size;
802 st->st_blocks = obdo.o_blocks;
803 st->st_mtime = obdo.o_mtime;
804 st->st_atime = obdo.o_atime;
805 st->st_ctime = obdo.o_ctime;
/*
 * Prepare @io for a read or write on @file: copy O_NONBLOCK, set the
 * append and sync write flags from the file flags / IS_SYNC(inode), bind the
 * inode's cl_object, and choose the lock requirement: CILR_NEVER with
 * ci_no_srvlock for "nolock" files, CILR_MANDATORY for O_APPEND files,
 * CILR_MAYBE otherwise.
 * NOTE(review): the test of @write guarding the two wr_* assignments is on a
 * line missing from this listing.
 */
810 void ll_io_init(struct cl_io *io, const struct file *file, int write)
812 struct inode *inode = file->f_dentry->d_inode;
814 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
816 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
817 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || IS_SYNC(inode);
819 io->ci_obj = ll_i2info(inode)->lli_clob;
820 io->ci_lockreq = CILR_MAYBE;
821 if (ll_file_nolock(file)) {
822 io->ci_lockreq = CILR_NEVER;
823 io->ci_no_srvlock = 1;
824 } else if (file->f_flags & O_APPEND) {
825 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common read/write path: run a cl_io of type @iot on @file for @count bytes
 * starting at *@ppos, using the arguments prepared in @args.
 *
 * When cl_io_rw_init() returns 0 the per-subtype state is copied from @args
 * and cl_io_loop() is run.  For normal IO, writes on a file without a group
 * lock are serialised by lli_write_mutex (an interrupted wait yields
 * -ERESTARTSYS) and reads hold lli_trunc_sem for reading.  Otherwise the
 * result comes from io->ci_result.  If any bytes were transferred *@ppos is
 * advanced; byte counters are tallied, and fd_write_failed is updated for
 * writes.
 * NOTE(review): the return-type line, case labels, several conditions and
 * the return are missing from this listing.
 */
830 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
831 struct file *file, enum cl_io_type iot,
832 loff_t *ppos, size_t count)
834 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
835 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
840 io = ccc_env_thread_io(env);
841 ll_io_init(io, file, iot == CIT_WRITE);
843 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
844 struct vvp_io *vio = vvp_env_io(env);
845 struct ccc_io *cio = ccc_env_io(env);
846 int write_mutex_locked = 0;
848 cio->cui_fd = LUSTRE_FPRIVATE(file);
849 vio->cui_io_subtype = args->via_io_subtype;
851 switch (vio->cui_io_subtype) {
853 cio->cui_iov = args->u.normal.via_iov;
854 cio->cui_nrsegs = args->u.normal.via_nrsegs;
855 cio->cui_tot_nrsegs = cio->cui_nrsegs;
856 #ifndef HAVE_FILE_WRITEV
857 cio->cui_iocb = args->u.normal.via_iocb;
859 if ((iot == CIT_WRITE) &&
860 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
861 if (cfs_mutex_lock_interruptible(&lli->
863 GOTO(out, result = -ERESTARTSYS);
864 write_mutex_locked = 1;
865 } else if (iot == CIT_READ) {
866 cfs_down_read(&lli->lli_trunc_sem);
870 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
871 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
874 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
875 vio->u.splice.cui_flags = args->u.splice.via_flags;
/* NOTE(review): "Unknow" is a typo in the logged message; it is left as-is
 * here because the string is runtime output. */
878 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
881 result = cl_io_loop(env, io);
/* Release whichever of the two locks was taken before the loop. */
882 if (write_mutex_locked)
883 cfs_mutex_unlock(&lli->lli_write_mutex);
884 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
885 cfs_up_read(&lli->lli_trunc_sem);
887 /* cl_io_rw_init() handled IO */
888 result = io->ci_result;
891 if (io->ci_nob > 0) {
893 *ppos = io->u.ci_wr.wr.crw_pos;
899 if (iot == CIT_READ) {
901 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
902 LPROC_LL_READ_BYTES, result);
903 } else if (iot == CIT_WRITE) {
905 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
906 LPROC_LL_WRITE_BYTES, result);
907 fd->fd_write_failed = false;
909 fd->fd_write_failed = true;
918 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Walk the first *@nr_segs entries of @iov, accumulating their lengths.
 * A segment with a negative length, or a running total that turns negative,
 * is rejected; each segment is checked with access_ok(VERIFY_READ, ...), and
 * the length of a segment that fails the check is subtracted again.
 * NOTE(review): the accumulation statement, the early exits and the final
 * stores to *@nr_segs / *@count are on lines missing from this listing.
 */
920 static int ll_file_get_iov_count(const struct iovec *iov,
921 unsigned long *nr_segs, size_t *count)
926 for (seg = 0; seg < *nr_segs; seg++) {
927 const struct iovec *iv = &iov[seg];
930 * If any segment has a negative length, or the cumulative
931 * length ever wraps negative then return -EINVAL.
934 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
936 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
941 cnt -= iv->iov_len; /* This segment is no good */
948 #ifdef HAVE_FILE_READV
/*
 * Vectored read entry point (HAVE_FILE_READV builds): validate @iov with
 * ll_file_get_iov_count(), obtain a cl environment, describe the request as
 * IO_NORMAL arguments and run it through ll_file_io_generic() as CIT_READ at
 * *@ppos.  The environment is released before leaving.
 * NOTE(review): local declarations, error tests and the final return are on
 * lines missing from this listing.
 */
949 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
950 unsigned long nr_segs, loff_t *ppos)
953 struct vvp_io_args *args;
959 result = ll_file_get_iov_count(iov, &nr_segs, &count);
963 env = cl_env_get(&refcheck);
965 RETURN(PTR_ERR(env));
967 args = vvp_env_args(env, IO_NORMAL);
/* The const qualifier of @iov is cast away to fit the args structure. */
968 args->u.normal.via_iov = (struct iovec *)iov;
969 args->u.normal.via_nrsegs = nr_segs;
971 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
972 cl_env_put(env, &refcheck);
/*
 * Plain read entry point (HAVE_FILE_READV builds): wrap @buf/@count in the
 * single iovec kept in the environment's vvp info and forward to
 * ll_file_readv() with one segment.
 * NOTE(review): the last parameter, local declarations, the error test and
 * the return are on lines missing from this listing.
 */
976 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
980 struct iovec *local_iov;
985 env = cl_env_get(&refcheck);
987 RETURN(PTR_ERR(env));
989 local_iov = &vvp_env_info(env)->vti_local_iov;
990 local_iov->iov_base = (void __user *)buf;
991 local_iov->iov_len = count;
992 result = ll_file_readv(file, local_iov, 1, ppos);
993 cl_env_put(env, &refcheck);
/*
 * AIO read entry point: validate @iov with ll_file_get_iov_count(), obtain
 * a cl environment, describe the request as IO_NORMAL arguments (including
 * @iocb) and run it through ll_file_io_generic() as CIT_READ on
 * iocb->ki_filp at iocb->ki_pos.
 * In the visible lines the position is taken from iocb->ki_pos; @pos is not
 * referenced.
 * NOTE(review): local declarations, error tests and the final return are on
 * lines missing from this listing.
 */
998 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
999 unsigned long nr_segs, loff_t pos)
1002 struct vvp_io_args *args;
1008 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1012 env = cl_env_get(&refcheck);
1014 RETURN(PTR_ERR(env));
1016 args = vvp_env_args(env, IO_NORMAL);
1017 args->u.normal.via_iov = (struct iovec *)iov;
1018 args->u.normal.via_nrsegs = nr_segs;
1019 args->u.normal.via_iocb = iocb;
1021 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1022 &iocb->ki_pos, count);
1023 cl_env_put(env, &refcheck);
/*
 * Plain read entry point (AIO variant): build a synchronous kiocb and a
 * single iovec in the environment's vvp info, start at *@ppos, forward to
 * ll_file_aio_read() and copy the resulting position back to *@ppos.
 * NOTE(review): the last parameter, local declarations, the error test and
 * the return are on lines missing from this listing.
 */
1027 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1031 struct iovec *local_iov;
1032 struct kiocb *kiocb;
1037 env = cl_env_get(&refcheck);
1039 RETURN(PTR_ERR(env));
1041 local_iov = &vvp_env_info(env)->vti_local_iov;
1042 kiocb = &vvp_env_info(env)->vti_kiocb;
1043 local_iov->iov_base = (void __user *)buf;
1044 local_iov->iov_len = count;
1045 init_sync_kiocb(kiocb, file);
1046 kiocb->ki_pos = *ppos;
1047 kiocb->ki_left = count;
1049 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the position reached by the IO back to the caller. */
1050 *ppos = kiocb->ki_pos;
1052 cl_env_put(env, &refcheck);
1058 * Write to a file (through the page cache).
1060 #ifdef HAVE_FILE_WRITEV
/*
 * Vectored write entry point (HAVE_FILE_WRITEV builds): validate @iov with
 * ll_file_get_iov_count(), obtain a cl environment, describe the request as
 * IO_NORMAL arguments and run it through ll_file_io_generic() as CIT_WRITE
 * at *@ppos.  The environment is released before leaving.
 * NOTE(review): local declarations, error tests and the final return are on
 * lines missing from this listing.
 */
1061 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1062 unsigned long nr_segs, loff_t *ppos)
1065 struct vvp_io_args *args;
1071 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1075 env = cl_env_get(&refcheck);
1077 RETURN(PTR_ERR(env));
1079 args = vvp_env_args(env, IO_NORMAL);
1080 args->u.normal.via_iov = (struct iovec *)iov;
1081 args->u.normal.via_nrsegs = nr_segs;
1083 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1084 cl_env_put(env, &refcheck);
/*
 * Plain write entry point (HAVE_FILE_WRITEV builds): wrap @buf/@count in
 * the single iovec kept in the environment's vvp info and forward to
 * ll_file_writev() with one segment.
 * NOTE(review): the last parameter, local declarations, the error test and
 * the return are on lines missing from this listing.
 */
1088 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1092 struct iovec *local_iov;
1097 env = cl_env_get(&refcheck);
1099 RETURN(PTR_ERR(env));
1101 local_iov = &vvp_env_info(env)->vti_local_iov;
1102 local_iov->iov_base = (void __user *)buf;
1103 local_iov->iov_len = count;
1105 result = ll_file_writev(file, local_iov, 1, ppos);
1106 cl_env_put(env, &refcheck);
1110 #else /* AIO stuff */
/*
 * AIO write entry point: validate @iov with ll_file_get_iov_count(), obtain
 * a cl environment, describe the request as IO_NORMAL arguments (including
 * @iocb) and run it through ll_file_io_generic() as CIT_WRITE on
 * iocb->ki_filp at iocb->ki_pos.
 * In the visible lines the position is taken from iocb->ki_pos; @pos is not
 * referenced.
 * NOTE(review): local declarations, error tests and the final return are on
 * lines missing from this listing.
 */
1111 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1112 unsigned long nr_segs, loff_t pos)
1115 struct vvp_io_args *args;
1121 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1125 env = cl_env_get(&refcheck);
1127 RETURN(PTR_ERR(env));
1129 args = vvp_env_args(env, IO_NORMAL);
1130 args->u.normal.via_iov = (struct iovec *)iov;
1131 args->u.normal.via_nrsegs = nr_segs;
1132 args->u.normal.via_iocb = iocb;
1134 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1135 &iocb->ki_pos, count);
1136 cl_env_put(env, &refcheck);
/*
 * Plain write entry point (AIO variant): build a synchronous kiocb and a
 * single iovec in the environment's vvp info, start at *@ppos, forward to
 * ll_file_aio_write() and copy the resulting position back to *@ppos.
 * NOTE(review): the last parameter, local declarations, the error test and
 * the return are on lines missing from this listing.
 */
1140 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1144 struct iovec *local_iov;
1145 struct kiocb *kiocb;
1150 env = cl_env_get(&refcheck);
1152 RETURN(PTR_ERR(env));
1154 local_iov = &vvp_env_info(env)->vti_local_iov;
1155 kiocb = &vvp_env_info(env)->vti_kiocb;
1156 local_iov->iov_base = (void __user *)buf;
1157 local_iov->iov_len = count;
1158 init_sync_kiocb(kiocb, file);
1159 kiocb->ki_pos = *ppos;
1160 kiocb->ki_left = count;
1162 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the position reached by the IO back to the caller. */
1163 *ppos = kiocb->ki_pos;
1165 cl_env_put(env, &refcheck);
1171 #ifdef HAVE_KERNEL_SENDFILE
1173 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile entry point (HAVE_KERNEL_SENDFILE builds): obtain a cl
 * environment, store @actor and @target in IO_SENDFILE arguments and run a
 * CIT_READ of @count bytes at *@ppos through ll_file_io_generic().
 * NOTE(review): local declarations, the error test and the return are on
 * lines missing from this listing.
 */
1175 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1176 read_actor_t actor, void *target)
1179 struct vvp_io_args *args;
1184 env = cl_env_get(&refcheck);
1186 RETURN(PTR_ERR(env));
1188 args = vvp_env_args(env, IO_SENDFILE);
1189 args->u.sendfile.via_target = target;
1190 args->u.sendfile.via_actor = actor;
1192 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1193 cl_env_put(env, &refcheck);
1198 #ifdef HAVE_KERNEL_SPLICE_READ
1200 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point (HAVE_KERNEL_SPLICE_READ builds): obtain a cl
 * environment, store @pipe and the splice flags in IO_SPLICE arguments and
 * run a CIT_READ of @count bytes at *@ppos through ll_file_io_generic().
 * NOTE(review): the flags parameter line, local declarations, the error
 * test and the return are on lines missing from this listing.
 */
1202 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1203 struct pipe_inode_info *pipe, size_t count,
1207 struct vvp_io_args *args;
1212 env = cl_env_get(&refcheck);
1214 RETURN(PTR_ERR(env));
1216 args = vvp_env_args(env, IO_SPLICE);
1217 args->u.splice.via_pipe = pipe;
1218 args->u.splice.via_flags = flags;
1220 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1221 cl_env_put(env, &refcheck);
/*
 * Ask the data export to recreate an object of @inode.
 *
 * The inode's stripe metadata is copied into a scratch lsm sized for its
 * stripe count, an obdo is marked with OBD_FL_RECREATE_OBJS and filled from
 * the inode (type and timestamps, parent FID), and obd_create() is called
 * under the inode size lock.  Fails with -ENOENT when the inode has no
 * stripe metadata and -ENOMEM on allocation failure.
 * NOTE(review): the rest of the parameter list, the obdo allocation, the
 * id/seq assignments and the return are on lines missing from this listing.
 */
1226 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1229 struct obd_export *exp = ll_i2dtexp(inode);
1230 struct obd_trans_info oti = { 0 };
1231 struct obdo *oa = NULL;
1234 struct lov_stripe_md *lsm = NULL, *lsm2;
1241 lsm = ccc_inode_lsm_get(inode);
1243 GOTO(out, rc = -ENOENT);
/* Header plus one lov_oinfo per stripe. */
1245 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1246 (lsm->lsm_stripe_count));
1248 OBD_ALLOC_LARGE(lsm2, lsm_size);
1250 GOTO(out, rc = -ENOMEM);
/* The target OST index is carried in the o_nlink field of the request. */
1254 oa->o_nlink = ost_idx;
1255 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1256 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1257 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1258 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1259 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1260 memcpy(lsm2, lsm, lsm_size);
1261 ll_inode_size_lock(inode);
1262 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1263 ll_inode_size_unlock(inode);
1265 OBD_FREE_LARGE(lsm2, lsm_size);
1268 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl helper: copy a struct ll_recreate_obj from user address @arg and
 * recreate the object it names (lrc_id on OST lrc_ost_idx, sequence 0) via
 * ll_lov_recreate().  The caller's CFS_CAP_SYS_ADMIN capability is tested
 * first.
 * NOTE(review): the error returns following the capability and copy checks
 * are on lines missing from this listing.
 */
1273 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1275 struct ll_recreate_obj ucreat;
1278 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1281 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1282 sizeof(struct ll_recreate_obj)))
1285 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1286 ucreat.lrc_ost_idx));
/*
 * ioctl helper: copy a struct lu_fid from user address @arg, derive an
 * object id and OST index from it, and recreate that object (sequence 0) via
 * ll_lov_recreate().  The caller's CFS_CAP_SYS_ADMIN capability is tested
 * first.
 * NOTE(review): local declarations and the error returns following the
 * capability and copy checks are on lines missing from this listing.
 */
1289 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1296 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1299 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1300 sizeof(struct lu_fid)))
/* id = FID oid in the low 32 bits, low 16 bits of the FID sequence above
 * them; the OST index is bits 16..31 of the FID sequence. */
1303 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1304 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1305 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Apply the user-supplied layout @lum (of @lum_size bytes) to @inode by
 * issuing an IT_OPEN intent with @flags through ll_intent_file_open().
 *
 * The inode's current stripe metadata is looked up first and a debug
 * message is logged when a stripe already exists.  The intent open runs
 * under the inode size lock; afterwards the open handle obtained through
 * the intent is closed with ll_release_openhandle(), the intent is released
 * and the stripe reference is dropped.  The out_req_free path finishes the
 * request stored in the intent.
 *
 * NOTE(review): the return value used for the "stripe already exists" case
 * is not visible here -- confirm.
 */
1308 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1309 int flags, struct lov_user_md *lum, int lum_size)
1311 struct lov_stripe_md *lsm = NULL;
1312 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1316 lsm = ccc_inode_lsm_get(inode);
1318 ccc_inode_lsm_put(inode, lsm);
1319 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1324 ll_inode_size_lock(inode);
1325 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1328 rc = oit.d.lustre.it_status;
1330 GOTO(out_req_free, rc);
/* the open was only needed to carry the layout; close its handle again */
1332 ll_release_openhandle(file->f_dentry, &oit);
1335 ll_inode_size_unlock(inode);
1336 ll_intent_release(&oit);
1337 ccc_inode_lsm_put(inode, lsm);
1340 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the striping EA of @filename (looked up relative to @inode) with
 * md_getattr_name() and convert it to host byte order.
 *
 * The reply buffer is sized with ll_get_max_mdsize().  The function fails
 * with -ENODATA when the reply does not flag the EA as valid and with
 * -EPROTO when the EA magic is neither LOV_MAGIC_V1 nor LOV_MAGIC_V3.
 * The EA size reported by the server is stored in *@lmm_size.
 *
 * NOTE(review): the assignments to *@lmmp and *@request are not visible
 * here; presumably they return the EA pointer and the request that owns
 * it -- confirm who must finish the request.
 */
1344 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1345 struct lov_mds_md **lmmp, int *lmm_size,
1346 struct ptlrpc_request **request)
1348 struct ll_sb_info *sbi = ll_i2sbi(inode);
1349 struct mdt_body *body;
1350 struct lov_mds_md *lmm = NULL;
1351 struct ptlrpc_request *req = NULL;
1352 struct md_op_data *op_data;
1355 rc = ll_get_max_mdsize(sbi, &lmmsize);
1359 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1360 strlen(filename), lmmsize,
1361 LUSTRE_OPC_ANY, NULL);
1362 if (IS_ERR(op_data))
1363 RETURN(PTR_ERR(op_data));
1365 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1366 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1367 ll_finish_md_op_data(op_data);
1369 CDEBUG(D_INFO, "md_getattr_name failed "
1370 "on %s: rc %d\n", filename, rc);
1374 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1375 LASSERT(body != NULL); /* checked by mdc_getattr_name */
/* from here on lmmsize is the EA size reported in the reply */
1377 lmmsize = body->eadatasize;
1379 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1381 GOTO(out, rc = -ENODATA);
1384 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1385 LASSERT(lmm != NULL);
1387 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1388 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1389 GOTO(out, rc = -EPROTO);
1393 * This is coming from the MDS, so is probably in
1394 * little endian. We convert it to host endian before
1395 * passing it to userspace.
1397 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1398 /* if function called for directory - we should
1399 * avoid swab not existent lsm objects */
1400 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1401 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1402 if (S_ISREG(body->mode))
1403 lustre_swab_lov_user_md_objects(
1404 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1405 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1406 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1407 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1408 if (S_ISREG(body->mode))
1409 lustre_swab_lov_user_md_objects(
1410 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1411 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1417 *lmm_size = lmmsize;
/*
 * ioctl helper: set a layout that already names its object.
 *
 * The caller must hold CFS_CAP_SYS_ADMIN.  A lov_user_md followed by
 * exactly one lov_user_ost_data is copied from the user pointer and passed
 * to ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS | FMODE_WRITE.
 * The temporary buffer is freed on both the copy-failure path and the
 * normal path.
 */
1422 static int ll_lov_setea(struct inode *inode, struct file *file,
1425 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1426 struct lov_user_md *lump;
1427 int lum_size = sizeof(struct lov_user_md) +
1428 sizeof(struct lov_user_ost_data);
1432 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1435 OBD_ALLOC_LARGE(lump, lum_size);
1439 if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1440 OBD_FREE_LARGE(lump, lum_size);
1444 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1446 OBD_FREE_LARGE(lump, lum_size);
/*
 * ioctl helper: set the layout of a file from a user-space lov_user_md.
 *
 * The user buffer is first read with the (smaller) v1 size; when its magic
 * is LOV_USER_MAGIC_V3 it is read again with the v3 size.  The layout is
 * applied with ll_lov_setstripe_ea_info() using FMODE_WRITE.  Afterwards
 * the user's lmm_stripe_count is zeroed and the resulting layout is
 * returned to @arg through the LL_IOC_LOV_GETSTRIPE iocontrol.
 *
 * NOTE(review): the condition guarding the read-back is not visible here,
 * and the result of put_user() is not checked on the visible line --
 * confirm both.
 */
1450 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1453 struct lov_user_md_v3 lumv3;
/* lumv1 aliases the v3 buffer so one local serves both versions */
1454 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1455 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1456 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1459 int flags = FMODE_WRITE;
1462 /* first try with v1 which is smaller than v3 */
1463 lum_size = sizeof(struct lov_user_md_v1);
1464 if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
1467 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1468 lum_size = sizeof(struct lov_user_md_v3);
1469 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1473 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1475 struct lov_stripe_md *lsm;
1476 put_user(0, &lumv1p->lmm_stripe_count);
1477 lsm = ccc_inode_lsm_get(inode);
1478 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1479 0, lsm, (void *)arg);
1480 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl helper: return the layout of @inode to user space.
 *
 * Takes a reference on the inode's stripe metadata, forwards the request
 * to the data export as an LL_IOC_LOV_GETSTRIPE iocontrol and drops the
 * reference again.
 */
1485 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1487 struct lov_stripe_md *lsm;
1491 lsm = ccc_inode_lsm_get(inode);
1493 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1495 ccc_inode_lsm_put(inode, lsm);
/*
 * Acquire a group lock with group id @arg for the open file @file.
 *
 * Returns -EOPNOTSUPP when ll_file_nolock() is true for the file.  A
 * warning is logged if this file descriptor already holds a group lock.
 * The lock is requested with cl_get_grouplock(), non-blocking when the
 * file was opened with O_NONBLOCK.  On success LL_FILE_GROUP_LOCKED is set
 * in fd_flags and the lock is stored in fd_grouplock.
 */
1499 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1501 struct ll_inode_info *lli = ll_i2info(inode);
1502 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1503 struct ccc_grouplock grouplock;
1507 if (ll_file_nolock(file))
1508 RETURN(-EOPNOTSUPP);
1510 cfs_spin_lock(&lli->lli_lock);
1511 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1512 CWARN("group lock already existed with gid %lu\n",
1513 fd->fd_grouplock.cg_gid);
1514 cfs_spin_unlock(&lli->lli_lock);
1517 LASSERT(fd->fd_grouplock.cg_lock == NULL);
/* lli_lock is dropped around cl_get_grouplock() and the flag re-checked after */
1518 cfs_spin_unlock(&lli->lli_lock);
1520 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1521 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1525 cfs_spin_lock(&lli->lli_lock);
1526 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1527 cfs_spin_unlock(&lli->lli_lock);
1528 CERROR("another thread just won the race\n");
/* give back the lock obtained above; the other thread's lock stays */
1529 cl_put_grouplock(&grouplock);
1533 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1534 fd->fd_grouplock = grouplock;
1535 cfs_spin_unlock(&lli->lli_lock);
1537 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with group id @arg held through @file.
 *
 * A warning is logged when the file descriptor holds no group lock or when
 * @arg differs from the group id of the lock it holds.  Otherwise the lock
 * is detached from the file descriptor under lli_lock and released with
 * cl_put_grouplock() after the spinlock has been dropped.
 */
1541 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1543 struct ll_inode_info *lli = ll_i2info(inode);
1544 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1545 struct ccc_grouplock grouplock;
1548 cfs_spin_lock(&lli->lli_lock);
1549 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1550 cfs_spin_unlock(&lli->lli_lock);
1551 CWARN("no group lock held\n");
1554 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1556 if (fd->fd_grouplock.cg_gid != arg) {
1557 CWARN("group lock %lu doesn't match current id %lu\n",
1558 arg, fd->fd_grouplock.cg_gid);
1559 cfs_spin_unlock(&lli->lli_lock);
/* take a local copy so the lock can be released outside the spinlock */
1563 grouplock = fd->fd_grouplock;
1564 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1565 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1566 cfs_spin_unlock(&lli->lli_lock);
1568 cl_put_grouplock(&grouplock);
1569 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Summary of the visible code path: nothing is closed for the filesystem
 * root dentry or when the intent does not carry DISP_OPEN_OPEN.  Otherwise
 * an obd_client_handle is allocated (-ENOMEM on failure), filled from the
 * intent with ll_och_fill() and closed with ll_close_inode_openhandle().
 * Finally the request reference tied to DISP_ENQ_OPEN_REF is dropped and
 * that disposition bit is cleared.
 */
1574 * Close inode open handle
1576 * \param dentry [in] dentry which contains the inode
1577 * \param it [in,out] intent which contains open info and result
1580 * \retval <0 failure
1582 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1584 struct inode *inode = dentry->d_inode;
1585 struct obd_client_handle *och;
1591 /* Root ? Do nothing. */
1592 if (dentry->d_inode->i_sb->s_root == dentry)
1595 /* No open handle to close? Move away */
1596 if (!it_disposition(it, DISP_OPEN_OPEN))
1599 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1601 OBD_ALLOC(och, sizeof(*och));
1603 GOTO(out, rc = -ENOMEM);
1605 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1606 ll_i2info(inode), it, och);
1608 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1611 /* this one is in place of ll_file_open */
1612 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1613 ptlrpc_req_finished(it->d.lustre.it_data);
1614 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Summary of the visible code path:
 *  - flags outside LUSTRE_FIEMAP_FLAGS_COMPAT are rejected; fm_flags is
 *    reduced to the unsupported bits before returning;
 *  - FIEMAP_FLAG_SYNC triggers filemap_fdatawrite() on the mapping first;
 *  - a file with more than one stripe needs FIEMAP_FLAG_DEVICE_ORDER,
 *    otherwise -EOPNOTSUPP is returned;
 *  - a file whose size is 0 reports zero mapped extents;
 *  - otherwise the request is copied into an ll_fiemap_info_key and sent
 *    to the data export with obd_get_info(); @num_bytes is the size of the
 *    caller's fiemap buffer.
 * The stripe metadata reference is dropped before returning.
 */
1620 * Get size for inode for which FIEMAP mapping is requested.
1621 * Make the FIEMAP get_info call and returns the result.
1623 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1626 struct obd_export *exp = ll_i2dtexp(inode);
1627 struct lov_stripe_md *lsm = NULL;
1628 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1629 int vallen = num_bytes;
1633 /* Checks for fiemap flags */
1634 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1635 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1639 /* Check for FIEMAP_FLAG_SYNC */
1640 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1641 rc = filemap_fdatawrite(inode->i_mapping);
1646 lsm = ccc_inode_lsm_get(inode);
1650 /* If the stripe_count > 1 and the application does not understand
1651 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1653 if (lsm->lsm_stripe_count > 1 &&
1654 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1655 GOTO(out, rc = -EOPNOTSUPP);
1657 fm_key.oa.o_id = lsm->lsm_object_id;
1658 fm_key.oa.o_seq = lsm->lsm_object_seq;
1659 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1661 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1662 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1663 /* If filesize is 0, then there would be no objects for mapping */
1664 if (fm_key.oa.o_size == 0) {
1665 fiemap->fm_mapped_extents = 0;
/* only the fixed-size fiemap header is copied into the key */
1669 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1671 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1674 CERROR("obd_get_info failed: rc = %d\n", rc);
1677 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl helper: resolve a FID to a path through the metadata export.
 *
 * Permitted when the caller holds CFS_CAP_DAC_READ_SEARCH or the
 * superblock has LL_SBI_USER_FID2PATH set.  The getinfo_fid2path header is
 * read from the user pointer @arg to learn gf_pathlen, an output buffer of
 * header plus gf_pathlen bytes is allocated, the request is issued as an
 * OBD_IOC_FID2PATH iocontrol and the whole buffer is copied back to @arg.
 *
 * NOTE(review): gf_pathlen comes from user space and no upper bound is
 * applied on the visible lines before it sizes the allocation -- confirm.
 * NOTE(review): the release of gfin is not visible here -- confirm it is
 * freed on every path.
 */
1681 int ll_fid2path(struct inode *inode, void *arg)
1683 struct obd_export *exp = ll_i2mdexp(inode);
1684 struct getinfo_fid2path *gfout, *gfin;
1688 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1689 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1692 /* Need to get the buflen */
1693 OBD_ALLOC_PTR(gfin);
1696 if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1701 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1702 OBD_ALLOC(gfout, outsize);
1703 if (gfout == NULL) {
1707 memcpy(gfout, gfin, sizeof(*gfout));
1710 /* Call mdc_iocontrol */
1711 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1714 if (cfs_copy_to_user(arg, gfout, outsize))
1718 OBD_FREE(gfout, outsize);
/*
 * ioctl helper for FSFILT_IOC_FIEMAP.
 *
 * Reads fm_extent_count from the user structure at @arg, allocates a
 * kernel buffer for the header plus that many extents, copies in the
 * request (and its first extent), runs ll_do_fiemap() and copies the
 * header plus fm_mapped_extents extents back to user space.  The buffer is
 * freed before returning; copy-in failures yield -EFAULT.
 *
 * NOTE(review): extent_count is user-controlled and is multiplied by the
 * extent size with no range check on the visible lines -- confirm that the
 * size computation cannot overflow on 32-bit builds.
 * NOTE(review): the copy-out length uses fm_mapped_extents; confirm it can
 * never exceed extent_count, otherwise the copy reads past the buffer.
 */
1722 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1724 struct ll_user_fiemap *fiemap_s;
1725 size_t num_bytes, ret_bytes;
1726 unsigned int extent_count;
1729 /* Get the extent count so we can calculate the size of
1730 * required fiemap buffer */
1731 if (get_user(extent_count,
1732 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1734 num_bytes = sizeof(*fiemap_s) + (extent_count *
1735 sizeof(struct ll_fiemap_extent));
1737 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1738 if (fiemap_s == NULL)
1741 /* get the fiemap value */
1742 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1744 GOTO(error, rc = -EFAULT);
1746 /* If fm_extent_count is non-zero, read the first extent since
1747 * it is used to calculate end_offset and device from previous
1750 if (copy_from_user(&fiemap_s->fm_extents[0],
1751 (char __user *)arg + sizeof(*fiemap_s),
1752 sizeof(struct ll_fiemap_extent)))
1753 GOTO(error, rc = -EFAULT);
1756 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1760 ret_bytes = sizeof(struct ll_user_fiemap);
1762 if (extent_count != 0)
1763 ret_bytes += (fiemap_s->fm_mapped_extents *
1764 sizeof(struct ll_fiemap_extent));
1766 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1770 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Summary of the visible code path: a reference on the stripe metadata is
 * taken; when there is none a debug message is logged (the comment below
 * says the version is then considered 0).  Otherwise an obdo is allocated
 * and filled by ll_lsm_getattr() on the data export, and *data_version is
 * set from o_data_version.  The reply is checked for
 * OBD_MD_FLDATAVERSION; the error used when it is missing is not visible
 * here.
 */
1775 * Read the data_version for inode.
1777 * This value is computed using stripe object version on OST.
1778 * Version is computed using server side locking.
1780 * @param extent_lock Take extent lock. Not needed if a process is already
1781 * holding the OST object group locks.
1783 static int ll_data_version(struct inode *inode, __u64 *data_version,
1786 struct lov_stripe_md *lsm = NULL;
1787 struct ll_sb_info *sbi = ll_i2sbi(inode);
1788 struct obdo *obdo = NULL;
1792 /* If no stripe, we consider version is 0. */
1793 lsm = ccc_inode_lsm_get(inode);
1796 CDEBUG(D_INODE, "No object for inode\n");
1800 OBD_ALLOC_PTR(obdo);
/* allocation-failure path: drop the stripe reference before bailing out */
1802 ccc_inode_lsm_put(inode, lsm);
1806 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1808 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1811 *data_version = obdo->o_data_version;
1815 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl entry point for regular files (installed as .unlocked_ioctl).
 *
 * Commands whose ioctl type is 'T' or 't' are treated as tty ioctls.  The
 * remaining commands are dispatched on @cmd:
 *  - LL_IOC_GETFLAGS / SETFLAGS / CLRFLAGS read or modify fd_flags;
 *    LL_FILE_IGNORE_LOCK is refused on files not opened with O_DIRECT;
 *  - layout, object-recreate, fiemap, group-lock, statfs, fid2path,
 *    data-version, MDT-index and obd-name commands are forwarded to their
 *    dedicated helpers;
 *  - FSFILT_IOC_GETVERSION* return inode->i_generation;
 *  - LL_IOC_PATH2FID copies the inode's FID to user space;
 *  - any other command is offered to ll_iocontrol_call() and otherwise
 *    sent to the data export through obd_iocontrol().
 *
 * @arg is command specific; most commands treat it as a user pointer.
 */
1820 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1822 struct inode *inode = file->f_dentry->d_inode;
1823 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1828 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1829 inode->i_generation, inode, cmd);
1830 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1832 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1833 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1837 case LL_IOC_GETFLAGS:
1838 /* Get the current value of the file flags */
1839 return put_user(fd->fd_flags, (int *)arg);
1840 case LL_IOC_SETFLAGS:
1841 case LL_IOC_CLRFLAGS:
1842 /* Set or clear specific file flags */
1843 /* XXX This probably needs checks to ensure the flags are
1844 * not abused, and to handle any flag side effects.
1846 if (get_user(flags, (int *) arg))
1849 if (cmd == LL_IOC_SETFLAGS) {
1850 if ((flags & LL_FILE_IGNORE_LOCK) &&
1851 !(file->f_flags & O_DIRECT)) {
1852 CERROR("%s: unable to disable locking on "
1853 "non-O_DIRECT file\n", current->comm);
1857 fd->fd_flags |= flags;
1859 fd->fd_flags &= ~flags;
1862 case LL_IOC_LOV_SETSTRIPE:
1863 RETURN(ll_lov_setstripe(inode, file, arg));
1864 case LL_IOC_LOV_SETEA:
1865 RETURN(ll_lov_setea(inode, file, arg));
1866 case LL_IOC_LOV_GETSTRIPE:
1867 RETURN(ll_lov_getstripe(inode, arg));
1868 case LL_IOC_RECREATE_OBJ:
1869 RETURN(ll_lov_recreate_obj(inode, arg));
1870 case LL_IOC_RECREATE_FID:
1871 RETURN(ll_lov_recreate_fid(inode, arg));
1872 case FSFILT_IOC_FIEMAP:
1873 RETURN(ll_ioctl_fiemap(inode, arg));
1874 case FSFILT_IOC_GETFLAGS:
1875 case FSFILT_IOC_SETFLAGS:
1876 RETURN(ll_iocontrol(inode, file, cmd, arg));
1877 case FSFILT_IOC_GETVERSION_OLD:
1878 case FSFILT_IOC_GETVERSION:
1879 RETURN(put_user(inode->i_generation, (int *)arg));
1880 case LL_IOC_GROUP_LOCK:
1881 RETURN(ll_get_grouplock(inode, file, arg));
1882 case LL_IOC_GROUP_UNLOCK:
1883 RETURN(ll_put_grouplock(inode, file, arg));
1884 case IOC_OBD_STATFS:
1885 RETURN(ll_obd_statfs(inode, (void *)arg));
1887 /* We need to special case any other ioctls we want to handle,
1888 * to send them to the MDS/OST as appropriate and to properly
1889 * network encode the arg field.
1890 case FSFILT_IOC_SETVERSION_OLD:
1891 case FSFILT_IOC_SETVERSION:
1893 case LL_IOC_FLUSHCTX:
1894 RETURN(ll_flush_ctx(inode));
1895 case LL_IOC_PATH2FID: {
1896 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1897 sizeof(struct lu_fid)))
1902 case OBD_IOC_FID2PATH:
1903 RETURN(ll_fid2path(inode, (void *)arg));
1904 case LL_IOC_DATA_VERSION: {
1905 struct ioc_data_version idv;
1908 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
1911 rc = ll_data_version(inode, &idv.idv_version,
1912 !(idv.idv_flags & LL_DV_NOFLUSH));
1915 cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1921 case LL_IOC_GET_MDTIDX: {
1924 mdtidx = ll_get_mdt_idx(inode);
1928 if (put_user((int)mdtidx, (int*)arg))
1933 case OBD_IOC_GETDTNAME:
1934 case OBD_IOC_GETMDNAME:
1935 RETURN(ll_get_obd_name(inode, cmd, arg));
1940 ll_iocontrol_call(inode, file, cmd, arg, &err))
1943 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek implementation.
 *
 * @origin 2 (SEEK_END) refreshes the file size with ll_glimpse_size() and
 * adds it to @offset; @origin 1 (SEEK_CUR) adds the current file position;
 * any other origin uses @offset unchanged.  The resulting offset is
 * accepted when it lies in [0, ll_file_maxbytes(inode)], and f_pos is
 * updated if it differs.
 */
1949 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1951 struct inode *inode = file->f_dentry->d_inode;
/* provisional target, computed from the cached size, for the trace below */
1954 retval = offset + ((origin == 2) ? i_size_read(inode) :
1955 (origin == 1) ? file->f_pos : 0);
1956 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1957 inode->i_ino, inode->i_generation, inode, retval, retval,
1958 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1959 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1961 if (origin == 2) { /* SEEK_END */
/* refresh the size before it is added to the offset */
1964 rc = ll_glimpse_size(inode);
1968 offset += i_size_read(inode);
1969 } else if (origin == 1) { /* SEEK_CUR */
1970 offset += file->f_pos;
1974 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1975 if (offset != file->f_pos) {
1976 file->f_pos = offset;
/*
 * flush handler: report asynchronous write-back errors.
 *
 * Reads and clears lli_async_rc, and additionally collects the error state
 * kept for the cl object via lov_read_and_clear_async_rc().  Returns -EIO
 * when an error was recorded and 0 otherwise.  When fd_write_failed is set
 * the application has already been told about the failure, so it is not
 * reported again.  Must not be called on directories (asserted).
 */
1984 int ll_flush(struct file *file, fl_owner_t id)
1986 struct inode *inode = file->f_dentry->d_inode;
1987 struct ll_inode_info *lli = ll_i2info(inode);
1988 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1991 LASSERT(!S_ISDIR(inode->i_mode));
1993 /* catch async errors that were recorded back when async writeback
1994 * failed for pages in this mapping. */
1995 rc = lli->lli_async_rc;
1996 lli->lli_async_rc = 0;
1997 err = lov_read_and_clear_async_rc(lli->lli_clob);
2001 /* The application has been told write failure already.
2002 * Do not report failure again. */
2003 if (fd->fd_write_failed)
/* any recorded error is reported as -EIO */
2005 return rc ? -EIO : 0;
/*
 * Summary of the visible code path: @mode must be one of CL_FSYNC_NONE,
 * CL_FSYNC_LOCAL, CL_FSYNC_DISCARD or CL_FSYNC_ALL.  A nested cl
 * environment and an OSS write capability are obtained, a CIT_FSYNC io is
 * set up on the inode's cl object with ci_ignore_layout set, and the io is
 * run with cl_io_loop() when cl_io_init() succeeds.  The result is taken
 * from fi_nr_written (pages written) or from ci_result; the io and the
 * environment are released before returning.
 */
2009 * Called to make sure a portion of file has been written out.
2010 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2012 * Return how many pages have been written.
2014 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2015 enum cl_fsync_mode mode)
2017 struct cl_env_nest nest;
2020 struct obd_capa *capa = NULL;
2021 struct cl_fsync_io *fio;
2025 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2026 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2029 env = cl_env_nested_get(&nest);
2031 RETURN(PTR_ERR(env));
2033 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2035 io = ccc_env_thread_io(env);
2036 io->ci_obj = cl_i2info(inode)->lli_clob;
2037 io->ci_ignore_layout = 1;
2039 /* initialize parameters for sync */
2040 fio = &io->u.ci_fsync;
2041 fio->fi_capa = capa;
2042 fio->fi_start = start;
2044 fio->fi_fid = ll_inode2fid(inode);
2045 fio->fi_mode = mode;
2046 fio->fi_nr_written = 0;
/* the io loop only runs when initialisation succeeded */
2048 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2049 result = cl_io_loop(env, io);
2051 result = io->ci_result;
2053 result = fio->fi_nr_written;
2054 cl_io_fini(env, io);
2055 cl_env_nested_put(&nest, env);
/*
 * fsync handler; the prototype depends on the kernel's fsync signature
 * (HAVE_FILE_FSYNC_4ARGS / HAVE_FILE_FSYNC_2ARGS / older three-argument
 * form).
 *
 * With the four-argument form the requested range is written and waited
 * for, and i_mutex is held for the rest of the function; otherwise the
 * function only waits for write-back already started by the caller.  It
 * then collects asynchronous write errors (non-directories only), syncs
 * the metadata through md_sync(), and syncs the whole data range with
 * cl_sync_file_range(), recording in fd_write_failed whether that data
 * sync failed.
 */
2062 #ifdef HAVE_FILE_FSYNC_4ARGS
2063 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2064 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2065 int ll_fsync(struct file *file, int data)
2067 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2070 struct inode *inode = file->f_dentry->d_inode;
2071 struct ll_inode_info *lli = ll_i2info(inode);
2072 struct ptlrpc_request *req;
2073 struct obd_capa *oc;
2074 struct lov_stripe_md *lsm;
2077 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2078 inode->i_generation, inode);
2079 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2081 #ifdef HAVE_FILE_FSYNC_4ARGS
2082 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2083 mutex_lock(&inode->i_mutex);
2085 /* fsync's caller has already called _fdata{sync,write}, we want
2086 * that IO to finish before calling the osc and mdc sync methods */
2087 rc = filemap_fdatawait(inode->i_mapping);
2090 /* catch async errors that were recorded back when async writeback
2091 * failed for pages in this mapping. */
2092 if (!S_ISDIR(inode->i_mode)) {
2093 err = lli->lli_async_rc;
2094 lli->lli_async_rc = 0;
2097 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* metadata sync: the request returned by md_sync() is finished below */
2102 oc = ll_mdscapa_get(inode);
2103 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2109 ptlrpc_req_finished(req);
2111 lsm = ccc_inode_lsm_get(inode);
2113 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* data sync over the whole object range */
2115 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2117 if (rc == 0 && err < 0)
2120 fd->fd_write_failed = true;
2122 fd->fd_write_failed = false;
2124 ccc_inode_lsm_put(inode, lsm);
2126 #ifdef HAVE_FILE_FSYNC_4ARGS
2127 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() lock handler (installed as both .flock and .lock).
 *
 * Translates the VFS file_lock into an LDLM_FLOCK enqueue sent to the
 * metadata export:
 *  - FL_FLOCK locks cover the whole file and are owned by the struct file;
 *    FL_POSIX locks use the byte range and owner from @file_lock;
 *  - the lock type selects the LDLM mode (LCK_PR, LCK_PW, or LCK_NL for an
 *    unlock request);
 *  - @cmd selects the enqueue flags (LDLM_FL_BLOCK_NOWAIT or
 *    LDLM_FL_TEST_LOCK on the visible lines).
 * After the enqueue the lock is also recorded locally with
 * flock_lock_file_wait() or posix_lock_file_wait() when the enqueue
 * succeeded or the request was an unlock; a test request skips the POSIX
 * record.
 */
2132 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2134 struct inode *inode = file->f_dentry->d_inode;
2135 struct ll_sb_info *sbi = ll_i2sbi(inode);
2136 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2137 .ei_cb_cp =ldlm_flock_completion_ast,
2138 .ei_cbdata = file_lock };
2139 struct md_op_data *op_data;
2140 struct lustre_handle lockh = {0};
2141 ldlm_policy_data_t flock = {{0}};
2146 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2147 inode->i_ino, file_lock);
2149 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2151 if (file_lock->fl_flags & FL_FLOCK) {
2152 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2153 /* flocks are whole-file locks */
2154 flock.l_flock.end = OFFSET_MAX;
2155 /* For flocks owner is determined by the local file descriptor */
2156 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2157 } else if (file_lock->fl_flags & FL_POSIX) {
2158 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2159 flock.l_flock.start = file_lock->fl_start;
2160 flock.l_flock.end = file_lock->fl_end;
2164 flock.l_flock.pid = file_lock->fl_pid;
2166 /* Somewhat ugly workaround for svc lockd.
2167 * lockd installs custom fl_lmops->lm_compare_owner that checks
2168 * for the fl_owner to be the same (which it always is on local node
2169 * I guess between lockd processes) and then compares pid.
2170 * As such we assign pid to the owner field to make it all work,
2171 * conflict with normal locks is unlikely since pid space and
2172 * pointer space for current->files are not intersecting */
2173 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2174 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2176 switch (file_lock->fl_type) {
2178 einfo.ei_mode = LCK_PR;
2181 /* An unlock request may or may not have any relation to
2182 * existing locks so we may not be able to pass a lock handle
2183 * via a normal ldlm_lock_cancel() request. The request may even
2184 * unlock a byte range in the middle of an existing lock. In
2185 * order to process an unlock request we need all of the same
2186 * information that is given with a normal read or write record
2187 * lock request. To avoid creating another ldlm unlock (cancel)
2188 * message we'll treat a LCK_NL flock request as an unlock. */
2189 einfo.ei_mode = LCK_NL;
2192 einfo.ei_mode = LCK_PW;
2195 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2196 file_lock->fl_type);
2211 flags = LDLM_FL_BLOCK_NOWAIT;
2217 flags = LDLM_FL_TEST_LOCK;
2218 /* Save the old mode so that if the mode in the lock changes we
2219 * can decrement the appropriate reader or writer refcount. */
2220 file_lock->fl_type = einfo.ei_mode;
2223 CERROR("unknown fcntl lock command: %d\n", cmd);
2227 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2228 LUSTRE_OPC_ANY, NULL);
2229 if (IS_ERR(op_data))
2230 RETURN(PTR_ERR(op_data));
2232 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2233 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2234 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2236 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2237 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2239 ll_finish_md_op_data(op_data);
/* mirror the result in the local VFS lock tables */
2241 if ((file_lock->fl_flags & FL_FLOCK) &&
2242 (rc == 0 || file_lock->fl_type == F_UNLCK))
2243 flock_lock_file_wait(file, file_lock);
2244 if ((file_lock->fl_flags & FL_POSIX) &&
2245 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2246 !(flags & LDLM_FL_TEST_LOCK))
2247 posix_lock_file_wait(file, file_lock);
2252 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Summary of the visible code path: every bit position below
 * MDS_INODELOCK_MAXSHIFT that is set in *bits is probed separately with
 * md_lock_match() using LDLM_FL_TEST_LOCK.  When @l_req_mode is
 * LCK_MINMODE any of the CR, CW, PR and PW modes is accepted.  Bits found
 * to be covered are cleared from *bits, and the loop stops early once
 * *bits is 0.
 */
2260 * test if some locks matching bits and l_req_mode are acquired
2261 * - bits can be in different locks
2262 * - if found clear the common lock bits in *bits
2263 * - the bits not found, are kept in *bits
2265 * \param bits [IN] searched lock bits [IN]
2266 * \param l_req_mode [IN] searched lock mode
2267 * \retval boolean, true iff all bits are found
2269 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2271 struct lustre_handle lockh;
2272 ldlm_policy_data_t policy;
2273 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2274 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2283 fid = &ll_i2info(inode)->lli_fid;
2284 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2285 ldlm_lockname[mode]);
2287 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2288 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
/* probe one inodebit at a time; unset bits are skipped */
2289 policy.l_inodebits.bits = *bits & (1 << i);
2290 if (policy.l_inodebits.bits == 0)
2293 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2294 &policy, mode, &lockh)) {
2295 struct ldlm_lock *lock;
2297 lock = ldlm_handle2lock(&lockh);
2300 ~(lock->l_policy_data.l_inodebits.bits);
2301 LDLM_LOCK_PUT(lock);
2303 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match an already granted inodebits lock on @inode that covers
 * @bits in any of the CR, CW, PR or PW modes.  Unlike ll_have_md_lock()
 * the match is made without LDLM_FL_TEST_LOCK, and the matched lock's
 * handle is stored in @lockh.
 *
 * NOTE(review): presumably returns the result of md_lock_match() (the
 * matched mode, 0 when nothing matched) -- the return statement is not
 * visible here; confirm.
 */
2310 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2311 struct lustre_handle *lockh)
2313 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2319 fid = &ll_i2info(inode)->lli_fid;
2320 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2322 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2323 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2324 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process the result @rc of a revalidation request for @inode.
 *
 * -ENOENT means the inode was already unlinked and is handled as described
 * by the comments below, with separate handling for inodes that are
 * neither regular files nor directories.  Other failures are logged with
 * CERROR().
 *
 * NOTE(review): the statements that adjust nlink and choose the return
 * value are not visible here -- confirm.
 */
2328 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2329 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2330 * and return success */
2332 /* This path cannot be hit for regular files unless in
2333 * case of obscure races, so no need to to validate
2335 if (!S_ISREG(inode->i_mode) &&
2336 !S_ISDIR(inode->i_mode))
2341 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate the attributes of the inode behind @dentry against the
 * metadata server.
 *
 * Two strategies are visible:
 *  - when the metadata export advertises OBD_CONNECT_ATTRFID, a getattr
 *    (or lookup, when @ibits is exactly MDS_INODELOCK_LOOKUP) intent is
 *    sent by FID with M_CHECK_STALE set, the reply is applied with
 *    ll_revalidate_it_finish(), and a dentry whose inode has no links left
 *    is invalidated;
 *  - otherwise, if no cached lock covers @ibits (ll_have_md_lock()), a
 *    plain md_getattr() is issued, asking for the striping EA as well for
 *    regular files, and the reply is applied with ll_prep_inode().
 * Request failures are passed through ll_inode_revalidate_fini().  The
 * request is finished before returning.
 */
2349 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2352 struct inode *inode = dentry->d_inode;
2353 struct ptlrpc_request *req = NULL;
2354 struct obd_export *exp;
2359 CERROR("REPORT THIS LINE TO PETER\n");
2363 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2364 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2366 exp = ll_i2mdexp(inode);
2368 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2369 * But under CMD case, it caused some lock issues, should be fixed
2370 * with new CMD ibits lock. See bug 12718 */
2371 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2372 struct lookup_intent oit = { .it_op = IT_GETATTR };
2373 struct md_op_data *op_data;
2375 if (ibits == MDS_INODELOCK_LOOKUP)
2376 oit.it_op = IT_LOOKUP;
2378 /* Call getattr by fid, so do not provide name at all. */
2379 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2380 dentry->d_inode, NULL, 0, 0,
2381 LUSTRE_OPC_ANY, NULL);
2382 if (IS_ERR(op_data))
2383 RETURN(PTR_ERR(op_data));
2385 oit.it_create_mode |= M_CHECK_STALE;
2386 rc = md_intent_lock(exp, op_data, NULL, 0,
2387 /* we are not interested in name
2390 ll_md_blocking_ast, 0);
2391 ll_finish_md_op_data(op_data);
2392 oit.it_create_mode &= ~M_CHECK_STALE;
2394 rc = ll_inode_revalidate_fini(inode, rc);
2398 rc = ll_revalidate_it_finish(req, &oit, dentry);
2400 ll_intent_release(&oit);
2404 /* Unlinked? Unhash dentry, so it is not picked up later by
2405 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2406 here to preserve get_cwd functionality on 2.6.
2408 if (!dentry->d_inode->i_nlink)
2409 d_lustre_invalidate(dentry);
2411 ll_lookup_finish_locks(&oit, dentry);
2412 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2413 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2414 obd_valid valid = OBD_MD_FLGETATTR;
2415 struct md_op_data *op_data;
2418 if (S_ISREG(inode->i_mode)) {
2419 rc = ll_get_max_mdsize(sbi, &ealen);
2422 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2425 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2426 0, ealen, LUSTRE_OPC_ANY,
2428 if (IS_ERR(op_data))
2429 RETURN(PTR_ERR(op_data));
2431 op_data->op_valid = valid;
2432 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2433 * capa for this inode. Because we only keep capas of dirs
2435 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2436 ll_finish_md_op_data(op_data);
2438 rc = ll_inode_revalidate_fini(inode, rc);
2442 rc = ll_prep_inode(&inode, req, NULL);
2445 ptlrpc_req_finished(req);
/*
 * Revalidate the inode behind @dentry, including its size.
 *
 * Metadata is refreshed with __ll_inode_revalidate_it().  When that
 * succeeds and the inode has no striping (lli_has_smd is clear), the
 * inode timestamps are taken from the cached lvb and the size is not
 * validated.  The size is refreshed with ll_glimpse_size().
 *
 * NOTE(review): the branch structure between the timestamp copy and the
 * glimpse is not visible here -- confirm the glimpse only runs when
 * objects exist.
 */
2449 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2452 struct inode *inode = dentry->d_inode;
2456 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2458 /* if object not yet allocated, don't validate size */
2459 if (rc == 0 && !ll_i2info(dentry->d_inode)->lli_has_smd) {
2460 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2461 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2462 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2466 /* ll_glimpse_size will prefer locally cached writes if they extend
2470 rc = ll_glimpse_size(inode);
/*
 * Fill @stat for the inode behind @de after revalidating it with the
 * MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP bits using intent @it.
 *
 * The reported inode number is rebuilt from the FID with
 * cl_fid_build_ino() when ll_need_32bit_api() is true for the superblock;
 * otherwise i_ino is used.  All other fields are copied from the inode,
 * and blksize is derived from i_blkbits.
 */
2475 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2476 struct lookup_intent *it, struct kstat *stat)
2478 struct inode *inode = de->d_inode;
2479 struct ll_sb_info *sbi = ll_i2sbi(inode);
2480 struct ll_inode_info *lli = ll_i2info(inode);
2483 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2484 MDS_INODELOCK_LOOKUP);
2485 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2490 stat->dev = inode->i_sb->s_dev;
/* callers limited to the 32-bit API get an inode number built from the FID */
2491 if (ll_need_32bit_api(sbi))
2492 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2494 stat->ino = inode->i_ino;
2495 stat->mode = inode->i_mode;
2496 stat->nlink = inode->i_nlink;
2497 stat->uid = inode->i_uid;
2498 stat->gid = inode->i_gid;
2499 stat->rdev = inode->i_rdev;
2500 stat->atime = inode->i_atime;
2501 stat->mtime = inode->i_mtime;
2502 stat->ctime = inode->i_ctime;
2503 stat->blksize = 1 << inode->i_blkbits;
2505 stat->size = i_size_read(inode);
2506 stat->blocks = inode->i_blocks;
/*
 * getattr handler: thin wrapper that calls ll_getattr_it() with a fresh
 * IT_GETATTR intent.
 */
2510 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2512 struct lookup_intent it = { .it_op = IT_GETATTR };
2514 return ll_getattr_it(mnt, de, &it, stat);
/*
 * Kernel fiemap entry point (built only with HAVE_LINUX_FIEMAP_H).
 *
 * Builds an ll_user_fiemap request from @fieinfo, @start and @len, seeds
 * its first extent from fi_extents_start, runs ll_do_fiemap() and copies
 * the flags, the mapped-extent count and the mapped extents back into
 * @fieinfo.  The temporary request buffer is freed before returning.
 *
 * NOTE(review): fi_extents_start is read and written with memcpy(); if it
 * is a user-space pointer in the target kernel this needs
 * copy_from_user()/copy_to_user() -- confirm.
 * NOTE(review): the copy-back length uses fm_mapped_extents; confirm it
 * cannot exceed fi_extents_max.
 */
2517 #ifdef HAVE_LINUX_FIEMAP_H
2518 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2519 __u64 start, __u64 len)
2523 struct ll_user_fiemap *fiemap;
2524 unsigned int extent_count = fieinfo->fi_extents_max;
2526 num_bytes = sizeof(*fiemap) + (extent_count *
2527 sizeof(struct ll_fiemap_extent));
2528 OBD_ALLOC_LARGE(fiemap, num_bytes);
2533 fiemap->fm_flags = fieinfo->fi_flags;
2534 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2535 fiemap->fm_start = start;
2536 fiemap->fm_length = len;
2537 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2538 sizeof(struct ll_fiemap_extent));
2540 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2542 fieinfo->fi_flags = fiemap->fm_flags;
2543 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2544 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2545 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2547 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a new reference to the POSIX ACL cached in the inode info
 * (lli_posix_acl).  The reference is taken under lli_lock; per the comment
 * below, the VFS permission check releases it.  @type is not used on the
 * visible lines.
 */
2552 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2554 struct ll_inode_info *lli = ll_i2info(inode);
2555 struct posix_acl *acl = NULL;
2558 cfs_spin_lock(&lli->lli_lock);
2559 /* VFS' acl_permission_check->check_acl will release the refcount */
2560 acl = posix_acl_dup(lli->lli_posix_acl);
2561 cfs_spin_unlock(&lli->lli_lock);
/*
 * ACL check callback, compiled only when generic_permission() does not
 * use the two-argument form.  The prototype gains a @flags argument with
 * HAVE_GENERIC_PERMISSION_4ARGS.
 *
 * With CONFIG_FS_POSIX_ACL the cached access ACL is obtained through
 * ll_get_acl(), evaluated with posix_acl_permission() against @mask and
 * released again.  The IPERM_FLAG_RCU case is handled separately before
 * the ACL is fetched.
 */
2566 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2568 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2569 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2571 ll_check_acl(struct inode *inode, int mask)
2574 # ifdef CONFIG_FS_POSIX_ACL
2575 struct posix_acl *acl;
2579 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2580 if (flags & IPERM_FLAG_RCU)
2583 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2588 rc = posix_acl_permission(inode, acl, mask);
2589 posix_acl_release(acl);
2592 # else /* !CONFIG_FS_POSIX_ACL */
2594 # endif /* CONFIG_FS_POSIX_ACL */
2596 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * Inode permission handler; the prototype depends on the kernel's
 * permission signature (HAVE_GENERIC_PERMISSION_4ARGS,
 * HAVE_INODE_PERMISION_2ARGS, or the nameidata form).
 *
 * Non-blocking (RCU) requests are detected through MAY_NOT_BLOCK or
 * IPERM_FLAG_RCU and handled separately.  The root inode is revalidated
 * first because lookup does not validate it.  On a remote client
 * (LL_SBI_RMT_CLIENT) the check is delegated to
 * lustre_check_remote_perm(); otherwise ll_generic_permission() is used
 * with ll_check_acl as the ACL callback.
 */
2598 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2599 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2601 # ifdef HAVE_INODE_PERMISION_2ARGS
2602 int ll_inode_permission(struct inode *inode, int mask)
2604 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2611 #ifdef MAY_NOT_BLOCK
2612 if (mask & MAY_NOT_BLOCK)
2614 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2615 if (flags & IPERM_FLAG_RCU)
2619 /* as root inode are NOT getting validated in lookup operation,
2620 * need to do it before permission check. */
2622 if (inode == inode->i_sb->s_root->d_inode) {
2623 struct lookup_intent it = { .it_op = IT_LOOKUP };
2625 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2626 MDS_INODELOCK_LOOKUP);
2631 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2632 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2634 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2635 return lustre_check_remote_perm(inode, mask);
2637 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2638 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/*
 * Name the vectored read/write slot of struct file_operations and the
 * Lustre handler stored in it, so the tables below can be written as
 * ".READ_METHOD = READ_FUNCTION" and ".WRITE_METHOD = WRITE_FUNCTION".
 *
 * With HAVE_FILE_READV the slots are readv/writev and the handlers are
 * ll_file_readv/ll_file_writev.  The second group of definitions maps
 * the slots aio_read/aio_write to ll_file_aio_read/ll_file_aio_write.
 */
2643 #ifdef HAVE_FILE_READV
2644 #define READ_METHOD readv
2645 #define READ_FUNCTION ll_file_readv
2646 #define WRITE_METHOD writev
2647 #define WRITE_FUNCTION ll_file_writev
2649 #define READ_METHOD aio_read
2650 #define READ_FUNCTION ll_file_aio_read
2651 #define WRITE_METHOD aio_write
2652 #define WRITE_FUNCTION ll_file_aio_write
/*
 * File operations table for regular files; per the comment below it is
 * the one used with the "-o localflock" mount option.
 *
 * The vectored read/write slot names and handlers come from the
 * READ_METHOD/READ_FUNCTION and WRITE_METHOD/WRITE_FUNCTION macros.
 * The sendfile and splice_read entries are guarded by
 * HAVE_KERNEL_SENDFILE and HAVE_KERNEL_SPLICE_READ respectively.
 */
2655 /* -o localflock - only provides locally consistent flock locks */
2656 struct file_operations ll_file_operations = {
2657 .read = ll_file_read,
2658 .READ_METHOD = READ_FUNCTION,
2659 .write = ll_file_write,
2660 .WRITE_METHOD = WRITE_FUNCTION,
2661 .unlocked_ioctl = ll_file_ioctl,
2662 .open = ll_file_open,
2663 .release = ll_file_release,
2664 .mmap = ll_file_mmap,
2665 .llseek = ll_file_seek,
2666 #ifdef HAVE_KERNEL_SENDFILE
2667 .sendfile = ll_file_sendfile,
2669 #ifdef HAVE_KERNEL_SPLICE_READ
2670 .splice_read = ll_file_splice_read,
/*
 * File operations table that routes both the .flock and the .lock
 * methods to ll_file_flock.  The read, write, ioctl, open, release,
 * mmap, llseek, sendfile and splice_read entries are the same handlers
 * as in ll_file_operations.
 */
2676 struct file_operations ll_file_operations_flock = {
2677 .read = ll_file_read,
2678 .READ_METHOD = READ_FUNCTION,
2679 .write = ll_file_write,
2680 .WRITE_METHOD = WRITE_FUNCTION,
2681 .unlocked_ioctl = ll_file_ioctl,
2682 .open = ll_file_open,
2683 .release = ll_file_release,
2684 .mmap = ll_file_mmap,
2685 .llseek = ll_file_seek,
2686 #ifdef HAVE_KERNEL_SENDFILE
2687 .sendfile = ll_file_sendfile,
2689 #ifdef HAVE_KERNEL_SPLICE_READ
2690 .splice_read = ll_file_splice_read,
/* one handler serves both flock-style and fcntl-style lock requests */
2694 .flock = ll_file_flock,
2695 .lock = ll_file_flock
/*
 * File operations table that routes both the .flock and the .lock
 * methods to ll_file_noflock.  The read, write, ioctl, open, release,
 * mmap, llseek, sendfile and splice_read entries are the same handlers
 * as in ll_file_operations.
 */
2698 /* These are for -o noflock - to return ENOSYS on flock calls */
2699 struct file_operations ll_file_operations_noflock = {
2700 .read = ll_file_read,
2701 .READ_METHOD = READ_FUNCTION,
2702 .write = ll_file_write,
2703 .WRITE_METHOD = WRITE_FUNCTION,
2704 .unlocked_ioctl = ll_file_ioctl,
2705 .open = ll_file_open,
2706 .release = ll_file_release,
2707 .mmap = ll_file_mmap,
2708 .llseek = ll_file_seek,
2709 #ifdef HAVE_KERNEL_SENDFILE
2710 .sendfile = ll_file_sendfile,
2712 #ifdef HAVE_KERNEL_SPLICE_READ
2713 .splice_read = ll_file_splice_read,
2717 .flock = ll_file_noflock,
2718 .lock = ll_file_noflock
/*
 * Inode operations table: attribute get/set, the permission check
 * (ll_inode_permission) and the four extended-attribute methods.
 * The .fiemap entry is present only with HAVE_LINUX_FIEMAP_H and the
 * .get_acl entry only with HAVE_IOP_GET_ACL.
 */
2721 struct inode_operations ll_file_inode_operations = {
2722 .setattr = ll_setattr,
2723 .getattr = ll_getattr,
2724 .permission = ll_inode_permission,
2725 .setxattr = ll_setxattr,
2726 .getxattr = ll_getxattr,
2727 .listxattr = ll_listxattr,
2728 .removexattr = ll_removexattr,
2729 #ifdef HAVE_LINUX_FIEMAP_H
2730 .fiemap = ll_fiemap,
2732 #ifdef HAVE_IOP_GET_ACL
2733 .get_acl = ll_get_acl,
2737 /* dynamic ioctl number support routines */
/*
 * File-local registry of dynamically registered ioctl handler blocks.
 * ioc_head is the list the blocks are linked on; ioc_sem is taken for
 * writing by ll_iocontrol_register()/ll_iocontrol_unregister() and for
 * reading by ll_iocontrol_call().  Both members are statically
 * initialized.
 */
2738 static struct llioc_ctl_data {
2739 cfs_rw_semaphore_t ioc_sem;
2740 cfs_list_t ioc_head;
2742 __RWSEM_INITIALIZER(llioc.ioc_sem),
2743 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* linkage on llioc.ioc_head */
2748 cfs_list_t iocd_list;
/* total allocation size in bytes; passed to OBD_FREE on unregister */
2749 unsigned int iocd_size;
/* callback invoked by ll_iocontrol_call() for a matching command */
2750 llioc_callback_t iocd_cb;
/* number of valid entries in iocd_cmd[] */
2751 unsigned int iocd_count;
/* trailing array of ioctl command numbers copied in at registration */
2752 unsigned int iocd_cmd[0];
/*
 * Register a callback for a set of dynamically handled ioctl commands.
 *
 * @cb:    callback stored in the new block; ll_iocontrol_call() invokes
 *         it for matching commands
 * @count: number of entries in @cmd; validated together with the NULL
 *         checks on @cb and @cmd (negative values and values above
 *         LLIOC_MAX_CMD fail the check)
 * @cmd:   array of ioctl command numbers, copied into the new block
 *
 * A block of sizeof(struct llioc_data) plus @count command slots is
 * allocated, filled in, and appended to llioc.ioc_head while
 * llioc.ioc_sem is held for writing.
 */
2755 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2758 struct llioc_data *in_data = NULL;
2761 if (cb == NULL || cmd == NULL ||
2762 count > LLIOC_MAX_CMD || count < 0)
/* fixed header followed by one slot per command number */
2765 size = sizeof(*in_data) + count * sizeof(unsigned int);
2766 OBD_ALLOC(in_data, size);
2767 if (in_data == NULL)
/* clear the fixed-size header; the command slots are filled by memcpy */
2770 memset(in_data, 0, sizeof(*in_data));
2771 in_data->iocd_size = size;
2772 in_data->iocd_cb = cb;
2773 in_data->iocd_count = count;
2774 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the block; readers walk the list under the same semaphore */
2776 cfs_down_write(&llioc.ioc_sem);
2777 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2778 cfs_up_write(&llioc.ioc_sem);
/*
 * Remove a registered ioctl handler block from llioc.ioc_head and free it.
 *
 * @magic: identifies the block to remove; it is printed in the warning
 *         that reports that no matching block was found
 *
 * The list is walked with llioc.ioc_sem held for writing.  For the entry
 * being removed, the semaphore is released before the memory is freed.
 */
2783 void ll_iocontrol_unregister(void *magic)
2785 struct llioc_data *tmp;
2790 cfs_down_write(&llioc.ioc_sem);
2791 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the allocation size recorded at registration time */
2793 unsigned int size = tmp->iocd_size;
2795 cfs_list_del(&tmp->iocd_list);
2796 cfs_up_write(&llioc.ioc_sem);
2798 OBD_FREE(tmp, size);
2802 cfs_up_write(&llioc.ioc_sem);
2804 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* make the registration interface available to other kernel modules */
2807 EXPORT_SYMBOL(ll_iocontrol_register);
2808 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Offer ioctl @cmd to the dynamically registered handlers.
 *
 * llioc.ioc_head is walked with llioc.ioc_sem held for reading.  Each
 * block has its iocd_cmd[] entries compared against @cmd; on a match the
 * block callback is invoked with (inode, file, cmd, arg, block, &rc) and
 * its llioc_iter result is stored in ret, which is then compared with
 * LLIOC_STOP.
 *
 * ret starts out as LLIOC_CONT and rc as -EINVAL, so those are the values
 * in place when no callback has run.
 */
2810 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2811 unsigned int cmd, unsigned long arg, int *rcp)
2813 enum llioc_iter ret = LLIOC_CONT;
2814 struct llioc_data *data;
2815 int rc = -EINVAL, i;
2817 cfs_down_read(&llioc.ioc_sem);
2818 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2819 for (i = 0; i < data->iocd_count; i++) {
2820 if (cmd != data->iocd_cmd[i])
2823 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2827 if (ret == LLIOC_STOP)
2830 cfs_up_read(&llioc.ioc_sem);
/*
 * Apply the layout configuration @conf to the cl_object attached to
 * @inode.
 *
 * lli_clob is tested for NULL before anything else is done.  A nested
 * cl environment is then obtained, cl_conf_set() is called on lli_clob
 * with @conf, and the environment is released again.  An error from
 * cl_env_nested_get() is propagated as PTR_ERR(env).
 */
2837 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2839 struct ll_inode_info *lli = ll_i2info(inode);
2840 struct cl_env_nest nest;
2845 if (lli->lli_clob == NULL)
2848 env = cl_env_nested_get(&nest);
2850 RETURN(PTR_ERR(env));
2852 result = cl_conf_set(env, lli->lli_clob, conf);
2853 cl_env_nested_put(&nest, env);
2858 * This function checks whether a LAYOUT lock exists on the client side,
2859 * and enqueues one if there is none in the cache.
2861 * This function does not hold the layout lock, so the lock may be revoked
2862 * at any time after it returns; layout-dependent work should then be redone.
2865 * This function should be called before lov_io_init() to get an up-to-date
2866 * layout version. The caller should save the version number and, after the
2867 * IO has finished, call this function again to verify that the layout
2868 * did not change while the IO was in progress.
2870 int ll_layout_refresh(struct inode *inode, __u32 *gen)
2872 struct ll_inode_info *lli = ll_i2info(inode);
2873 struct ll_sb_info *sbi = ll_i2sbi(inode);
2874 struct md_op_data *op_data = NULL;
2875 struct ptlrpc_request *req = NULL;
2876 struct lookup_intent it = { .it_op = IT_LAYOUT };
2877 struct lustre_handle lockh;
2879 struct cl_object_conf conf = { .coc_inode = inode,
2880 .coc_validate_only = true };
2885 if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_LAYOUT_LOCK))
2889 LASSERT(fid_is_sane(ll_inode2fid(inode)));
2890 LASSERT(S_ISREG(inode->i_mode));
2892 /* mostly layout lock is caching on the local side, so try to match
2893 * it before grabbing layout lock mutex. */
2894 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh);
2895 if (mode != 0) { /* hit cached lock */
2896 struct lov_stripe_md *lsm;
2898 lsm = ccc_inode_lsm_get(inode);
2900 *gen = lsm->lsm_layout_gen;
2901 ccc_inode_lsm_put(inode, lsm);
2902 ldlm_lock_decref(&lockh, mode);
2907 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
2908 0, 0, LUSTRE_OPC_ANY, NULL);
2909 if (IS_ERR(op_data))
2910 RETURN(PTR_ERR(op_data));
2912 /* take layout lock mutex to enqueue layout lock exclusively. */
2913 cfs_mutex_lock(&lli->lli_layout_mutex);
2915 /* make sure the old conf goes away */
2916 ll_layout_conf(inode, &conf);
2918 /* enqueue layout lock */
2919 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0,
2920 &req, ll_md_blocking_ast, 0);
2922 /* we get a new lock, so update the lock data */
2923 lockh.cookie = it.d.lustre.it_lock_handle;
2924 md_set_lock_data(sbi->ll_md_exp, &lockh.cookie, inode, NULL);
2926 /* req == NULL is when lock was found in client cache, without
2927 * any request to server (but lsm can be canceled just after a
2930 struct ldlm_lock *lock = ldlm_handle2lock(&lockh);
2931 struct lustre_md md = { NULL };
2935 /* for IT_LAYOUT lock, lmm is returned in lock's lvb
2936 * data via completion callback */
2937 LASSERT(lock != NULL);
2938 lmm = lock->l_lvb_data;
2939 lmmsize = lock->l_lvb_len;
2941 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
2945 *gen = md.lsm->lsm_layout_gen;
2947 memset(&conf, 0, sizeof conf);
2948 conf.coc_inode = inode;
2949 conf.u.coc_md = &md;
2950 ll_layout_conf(inode, &conf);
2952 lli->lli_has_smd = md.lsm != NULL;
2955 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
2957 LDLM_LOCK_PUT(lock);
2958 ptlrpc_req_finished(req);
2959 } else { /* hit caching lock */
2960 struct lov_stripe_md *lsm;
2962 lsm = ccc_inode_lsm_get(inode);
2964 *gen = lsm->lsm_layout_gen;
2965 ccc_inode_lsm_put(inode, lsm);
2967 ll_intent_drop_lock(&it);
2969 cfs_mutex_unlock(&lli->lli_layout_mutex);
2970 ll_finish_md_op_data(op_data);