4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): the listing is partial — the NULL check on the allocation
 * and the return of @fd are not visible here.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Start with a clean write-error state for the new open. */
58 fd->fd_write_failed = false;
/* Return a ll_file_data previously obtained from ll_file_data_get()
 * to its slab cache. */
62 static void ll_file_data_put(struct ll_file_data *fd)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes into @op_data for an MDS request:
 * fid, mode, a/m/ctime, size, block count, external flags, the current
 * IO epoch, the open handle @fh, and an MDS capability reference.
 */
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; convert the kernel
 * inode flags to their on-wire (ext) representation. */
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
82 op_data->op_handle = *fh;
/* ll_mdscapa_get() takes a capability reference; presumably released by
 * the op_data teardown path — not visible in this listing. */
83 op_data->op_capa1 = ll_mdscapa_get(inode);
87 * Closes the IO epoch and packs all the attributes into @op_data for
90 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
91 struct obd_client_handle *och)
/* Always send mode and explicit timestamps on close. */
95 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
96 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Read-only opens cannot have dirtied size/blocks; the skipped branch
 * (not visible in this listing) presumably bypasses the SOM handling. */
98 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client is
 * authoritative for size/blocks, so include them in the close. */
101 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
102 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
104 ll_ioepoch_close(inode, op_data, &och, 0);
107 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
108 ll_prep_md_op_data(op_data, inode, NULL, NULL,
109 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send MDS_CLOSE for an open handle @och on @inode via @md_exp, then
 * process the reply: Size-on-MDS update if the epoch closed, OST object
 * destruction for an unlinked file, and replay-data cleanup.
 * NOTE(review): partial listing — several error branches and the final
 * RETURN are not visible here.
 */
113 static int ll_close_inode_openhandle(struct obd_export *md_exp,
115 struct obd_client_handle *och)
117 struct obd_export *exp = ll_i2mdexp(inode);
118 struct md_op_data *op_data;
119 struct ptlrpc_request *req = NULL;
120 struct obd_device *obd = class_exp2obd(exp);
127 * XXX: in case of LMV, is this correct to access
130 CERROR("Invalid MDC connection handle "LPX64"\n",
131 ll_i2mdexp(inode)->exp_handle.h_cookie);
135 OBD_ALLOC_PTR(op_data);
137 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139 ll_prepare_close(inode, op_data, och);
/* Remember whether this close ends the IO epoch before issuing the RPC. */
140 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141 rc = md_close(md_exp, op_data, och->och_mod, &req);
143 /* This close must have the epoch closed. */
144 LASSERT(epoch_close);
145 /* MDS has instructed us to obtain Size-on-MDS attribute from
146 * OSTs and send setattr to back to MDS. */
147 rc = ll_som_update(inode, op_data);
149 CERROR("inode %lu mdc Size-on-MDS update failed: "
150 "rc = %d\n", inode->i_ino, rc);
154 CERROR("inode %lu mdc close failed: rc = %d\n",
157 ll_finish_md_op_data(op_data);
/* If the file was unlinked, destroy its OST objects now that the last
 * reference is gone. */
160 rc = ll_objects_destroy(req, inode);
162 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM + epoch still open on a written regular file: defer the final
 * attribute update to the DONE_WRITING path. */
169 if (exp_connect_som(exp) && !epoch_close &&
170 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
171 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173 md_clear_open_replay_data(md_exp, och);
174 /* Free @och if it is not waiting for DONE_WRITING. */
175 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
178 if (req) /* This is close request */
179 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the kind selected by @flags
 * (write/exec/read) if no other local users still hold it.
 * NOTE(review): partial listing — the och swap under the mutex and the
 * final RETURN are not visible here.
 */
183 int ll_md_real_close(struct inode *inode, int flags)
185 struct ll_inode_info *lli = ll_i2info(inode);
186 struct obd_client_handle **och_p;
187 struct obd_client_handle *och;
/* Pick the per-inode handle slot and use count matching the open mode. */
192 if (flags & FMODE_WRITE) {
193 och_p = &lli->lli_mds_write_och;
194 och_usecount = &lli->lli_open_fd_write_count;
195 } else if (flags & FMODE_EXEC) {
196 och_p = &lli->lli_mds_exec_och;
197 och_usecount = &lli->lli_open_fd_exec_count;
199 LASSERT(flags & FMODE_READ);
200 och_p = &lli->lli_mds_read_och;
201 och_usecount = &lli->lli_open_fd_read_count;
204 mutex_lock(&lli->lli_och_mutex);
205 if (*och_usecount) { /* There are still users of this handle, so
207 mutex_unlock(&lli->lli_och_mutex);
212 mutex_unlock(&lli->lli_och_mutex);
214 if (och) { /* There might be a race and somebody have freed this och
216 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: drop any group lock, decrement the use count
 * for the open mode, and — unless a cached OPEN DLM lock lets us skip
 * the RPC — perform the real MDS close.  Also frees the file's private
 * ll_file_data and releases the OSS capability.
 */
223 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
226 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
227 struct ll_inode_info *lli = ll_i2info(inode);
231 /* clear group lock, if present */
232 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
233 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235 /* Let's see if we have good enough OPEN lock on the file and if
236 we can skip talking to MDS */
237 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a granted lock, don't take a reference. */
239 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
240 struct lustre_handle lockh;
241 struct inode *inode = file->f_dentry->d_inode;
242 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop this fd's contribution to the open count for its mode. */
244 mutex_lock(&lli->lli_och_mutex);
245 if (fd->fd_omode & FMODE_WRITE) {
247 LASSERT(lli->lli_open_fd_write_count);
248 lli->lli_open_fd_write_count--;
249 } else if (fd->fd_omode & FMODE_EXEC) {
251 LASSERT(lli->lli_open_fd_exec_count);
252 lli->lli_open_fd_exec_count--;
255 LASSERT(lli->lli_open_fd_read_count);
256 lli->lli_open_fd_read_count--;
258 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock -> must send the close RPC now. */
260 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
261 LDLM_IBITS, &policy, lockmode,
263 rc = ll_md_real_close(file->f_dentry->d_inode,
267 CERROR("Releasing a file %p with negative dentry %p. Name %s",
268 file, file->f_dentry, file->f_dentry->d_name.name);
271 LUSTRE_FPRIVATE(file) = NULL;
272 ll_file_data_put(fd);
273 ll_capa_close(inode);
278 /* While this returns an error code, fput() the caller does not, so we need
279 * to make every effort to clean up all of our state here. Also, applications
280 * rarely check close errors and even if an error is returned they will not
281 * re-try the close call.
283 int ll_file_release(struct inode *inode, struct file *file)
285 struct ll_file_data *fd;
286 struct ll_sb_info *sbi = ll_i2sbi(inode);
287 struct ll_inode_info *lli = ll_i2info(inode);
291 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
292 inode->i_generation, inode);
294 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the filesystem root. */
295 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
296 inode == inode->i_sb->s_root->d_inode) {
297 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
300 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
301 fd->fd_flags &= ~LL_FILE_RMTACL;
302 rct_del(&sbi->ll_rct, cfs_curproc_pid());
303 et_search_free(&sbi->ll_et, cfs_curproc_pid());
308 if (inode->i_sb->s_root != file->f_dentry)
309 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
310 fd = LUSTRE_FPRIVATE(file);
313 /* The last ref on @file, maybe not the the owner pid of statahead.
314 * Different processes can open the same dir, "ll_opendir_key" means:
315 * it is me that should stop the statahead thread. */
316 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
317 lli->lli_opendir_pid != 0)
318 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root directory never had an MDS open handle; just drop fd. */
320 if (inode->i_sb->s_root == file->f_dentry) {
321 LUSTRE_FPRIVATE(file) = NULL;
322 ll_file_data_put(fd);
/* Fold any deferred async write errors into this close's result. */
326 if (!S_ISDIR(inode->i_mode)) {
327 lov_read_and_clear_async_rc(lli->lli_clob);
328 lli->lli_async_rc = 0;
331 rc = ll_md_close(sbi->ll_md_exp, inode, file);
333 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
334 libcfs_debug_dumplog();
/*
 * Perform an intent-based open RPC to the MDS for @file, optionally
 * passing striping info (@lmm/@lmmsize) when setting stripe parameters.
 * On success the dentry's inode is (re)filled from the reply and the
 * returned DLM lock data is attached.
 * NOTE(review): partial listing — intermediate error checks and the
 * final RETURN are not visible here.
 */
339 static int ll_intent_file_open(struct file *file, void *lmm,
340 int lmmsize, struct lookup_intent *itp)
342 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
343 struct dentry *parent = file->f_dentry->d_parent;
344 const char *name = file->f_dentry->d_name.name;
345 const int len = file->f_dentry->d_name.len;
346 struct md_op_data *op_data;
347 struct ptlrpc_request *req;
348 __u32 opc = LUSTRE_OPC_ANY;
355 /* Usually we come here only for NFSD, and we want open lock.
356 But we can also get here with pre 2.6.15 patchless kernels, and in
357 that case that lock is also ok */
358 /* We can also get here if there was cached open handle in revalidate_it
359 * but it disappeared while we were getting from there to ll_file_open.
360 * But this means this file was closed and immediatelly opened which
361 * makes a good candidate for using OPEN lock */
362 /* If lmmsize & lmm are not 0, we are just setting stripe info
363 * parameters. No need for the open lock */
364 if (lmm == NULL && lmmsize == 0) {
365 itp->it_flags |= MDS_OPEN_LOCK;
366 if (itp->it_flags & FMODE_WRITE)
367 opc = LUSTRE_OPC_CREATE;
370 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
371 file->f_dentry->d_inode, name, len,
374 RETURN(PTR_ERR(op_data));
/* The target inode is already known, so open by FID. */
376 itp->it_flags |= MDS_OPEN_BY_FID;
377 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
378 0 /*unused */, &req, ll_md_blocking_ast, 0);
379 ll_finish_md_op_data(op_data);
381 /* reason for keep own exit path - don`t flood log
382 * with messages with -ESTALE errors.
384 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
385 it_open_error(DISP_OPEN_OPEN, itp))
387 ll_release_openhandle(file->f_dentry, itp);
391 if (it_disposition(itp, DISP_LOOKUP_NEG))
392 GOTO(out, rc = -ENOENT);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh/instantiate the inode from the open reply. */
400 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
401 if (!rc && itp->d.lustre.it_lock_mode)
402 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Cleanup path: drop the request reference and the intent's lock. */
406 ptlrpc_req_finished(itp->d.lustre.it_data);
407 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
408 ll_intent_drop_lock(itp);
414 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
415 * not believe attributes if a few ioepoch holders exist. Attributes for
416 * previous ioepoch if new one is opened are also skipped by MDS.
418 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
420 if (ioepoch && lli->lli_ioepoch != ioepoch) {
421 lli->lli_ioepoch = ioepoch;
422 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
423 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDS open reply carried by @it:
 * copy the server file handle, record fid/flags, adopt the reply's IO
 * epoch, and register the open for replay on recovery.
 */
427 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
428 struct lookup_intent *it, struct obd_client_handle *och)
430 struct ptlrpc_request *req = it->d.lustre.it_data;
431 struct mdt_body *body;
435 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
436 LASSERT(body != NULL); /* reply already checked out */
438 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
439 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
440 och->och_fid = lli->lli_fid;
441 och->och_flags = it->it_flags;
442 ll_ioepoch_open(lli, body->ioepoch);
444 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish a local open: optionally fill @och from the intent's open
 * reply, then attach @fd as the file's private data and initialize
 * readahead state and the recorded open mode.
 */
447 int ll_local_open(struct file *file, struct lookup_intent *it,
448 struct ll_file_data *fd, struct obd_client_handle *och)
450 struct inode *inode = file->f_dentry->d_inode;
451 struct ll_inode_info *lli = ll_i2info(inode);
/* The caller must not have installed private data yet. */
454 LASSERT(!LUSTRE_FPRIVATE(file));
459 struct ptlrpc_request *req = it->d.lustre.it_data;
460 struct mdt_body *body;
463 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
467 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
468 if ((it->it_flags & FMODE_WRITE) &&
469 (body->valid & OBD_MD_FLSIZE))
470 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
471 lli->lli_ioepoch, PFID(&lli->lli_fid));
474 LUSTRE_FPRIVATE(file) = fd;
475 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the mode this fd was opened with for close-time accounting. */
476 fd->fd_omode = it->it_flags;
480 /* Open a file, and (for the very first open) create objects on the OSTs at
481 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
482 * creation or open until ll_lov_setstripe() ioctl is called.
484 * If we already have the stripe MD locally then we don't request it in
485 * md_open(), by passing a lmm_size = 0.
487 * It is up to the application to ensure no other processes open this file
488 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
489 * used. We might be able to avoid races of that sort by getting lli_open_sem
490 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
491 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
493 int ll_file_open(struct inode *inode, struct file *file)
495 struct ll_inode_info *lli = ll_i2info(inode);
496 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
497 .it_flags = file->f_flags };
498 struct obd_client_handle **och_p = NULL;
499 __u64 *och_usecount = NULL;
500 struct ll_file_data *fd;
501 int rc = 0, opendir_set = 0;
504 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
505 inode->i_generation, inode, file->f_flags);
/* An intent may have been stashed by the lookup path. */
507 it = file->private_data; /* XXX: compat macro */
508 file->private_data = NULL; /* prevent ll_local_open assertion */
510 fd = ll_file_data_get();
512 GOTO(out_och_free, rc = -ENOMEM);
/* First opener of a directory claims the statahead "owner" key. */
515 if (S_ISDIR(inode->i_mode)) {
516 spin_lock(&lli->lli_sa_lock);
517 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
518 lli->lli_opendir_pid == 0) {
519 lli->lli_opendir_key = fd;
520 lli->lli_opendir_pid = cfs_curproc_pid();
523 spin_unlock(&lli->lli_sa_lock);
/* The root dentry never talks to the MDS for open. */
526 if (inode->i_sb->s_root == file->f_dentry) {
527 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own from f_flags. */
531 if (!it || !it->d.lustre.it_disposition) {
532 /* Convert f_flags into access mode. We cannot use file->f_mode,
533 * because everything but O_ACCMODE mask was stripped from
535 if ((oit.it_flags + 1) & O_ACCMODE)
537 if (file->f_flags & O_TRUNC)
538 oit.it_flags |= FMODE_WRITE;
540 /* kernel only call f_op->open in dentry_open. filp_open calls
541 * dentry_open after call to open_namei that checks permissions.
542 * Only nfsd_open call dentry_open directly without checking
543 * permissions and because of that this code below is safe. */
544 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
545 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
547 /* We do not want O_EXCL here, presumably we opened the file
548 * already? XXX - NFS implications? */
549 oit.it_flags &= ~O_EXCL;
551 /* bug20584, if "it_flags" contains O_CREAT, the file will be
552 * created if necessary, then "IT_CREAT" should be set to keep
553 * consistent with it */
554 if (oit.it_flags & O_CREAT)
555 oit.it_op |= IT_CREAT;
561 /* Let's see if we have file open on MDS already. */
562 if (it->it_flags & FMODE_WRITE) {
563 och_p = &lli->lli_mds_write_och;
564 och_usecount = &lli->lli_open_fd_write_count;
565 } else if (it->it_flags & FMODE_EXEC) {
566 och_p = &lli->lli_mds_exec_och;
567 och_usecount = &lli->lli_open_fd_exec_count;
569 och_p = &lli->lli_mds_read_och;
570 och_usecount = &lli->lli_open_fd_read_count;
573 mutex_lock(&lli->lli_och_mutex);
574 if (*och_p) { /* Open handle is present */
575 if (it_disposition(it, DISP_OPEN_OPEN)) {
576 /* Well, there's extra open request that we do not need,
577 let's close it somehow. This will decref request. */
578 rc = it_open_error(DISP_OPEN_OPEN, it);
580 mutex_unlock(&lli->lli_och_mutex);
581 GOTO(out_openerr, rc);
584 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle; no och needed for this fd. */
588 rc = ll_local_open(file, it, fd, NULL);
591 mutex_unlock(&lli->lli_och_mutex);
592 GOTO(out_openerr, rc);
595 LASSERT(*och_usecount == 0);
596 if (!it->d.lustre.it_disposition) {
597 /* We cannot just request lock handle now, new ELC code
598 means that one of other OPEN locks for this file
599 could be cancelled, and since blocking ast handler
600 would attempt to grab och_mutex as well, that would
601 result in a deadlock */
602 mutex_unlock(&lli->lli_och_mutex);
603 it->it_create_mode |= M_CHECK_STALE;
604 rc = ll_intent_file_open(file, NULL, 0, it);
605 it->it_create_mode &= ~M_CHECK_STALE;
607 GOTO(out_openerr, rc);
611 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
613 GOTO(out_och_free, rc = -ENOMEM);
617 /* md_intent_lock() didn't get a request ref if there was an
618 * open error, so don't do cleanup on the request here
620 /* XXX (green): Should not we bail out on any error here, not
621 * just open error? */
622 rc = it_open_error(DISP_OPEN_OPEN, it);
624 GOTO(out_och_free, rc);
626 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
628 rc = ll_local_open(file, it, fd, *och_p);
630 GOTO(out_och_free, rc);
632 mutex_unlock(&lli->lli_och_mutex);
635 /* Must do this outside lli_och_mutex lock to prevent deadlock where
636 different kind of OPEN lock for this same inode gets cancelled
637 by ldlm_cancel_lru */
638 if (!S_ISREG(inode->i_mode))
639 GOTO(out_och_free, rc);
/* No striping yet: either delay object creation (O_LOV_DELAY_CREATE
 * or read-only open) or fall through to create objects now. */
643 if (!lli->lli_has_smd) {
644 if (file->f_flags & O_LOV_DELAY_CREATE ||
645 !(file->f_mode & FMODE_WRITE)) {
646 CDEBUG(D_INODE, "object creation was delayed\n");
647 GOTO(out_och_free, rc);
650 file->f_flags &= ~O_LOV_DELAY_CREATE;
651 GOTO(out_och_free, rc);
/* Error/cleanup paths below (labels not visible in this listing). */
655 if (och_p && *och_p) {
656 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
657 *och_p = NULL; /* OBD_FREE writes some magic there */
660 mutex_unlock(&lli->lli_och_mutex);
663 if (opendir_set != 0)
664 ll_stop_statahead(inode, lli->lli_opendir_key);
666 ll_file_data_put(fd);
668 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
671 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
672 ptlrpc_req_finished(it->d.lustre.it_data);
673 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
679 /* Fills the obdo with the attributes for the lsm */
680 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
681 struct obd_capa *capa, struct obdo *obdo,
682 __u64 ioepoch, int sync)
684 struct ptlrpc_request_set *set;
685 struct obd_info oinfo = { { { 0 } } };
690 LASSERT(lsm != NULL);
/* Identify the object and request the full attribute set. */
694 oinfo.oi_oa->o_id = lsm->lsm_object_id;
695 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
696 oinfo.oi_oa->o_mode = S_IFREG;
697 oinfo.oi_oa->o_ioepoch = ioepoch;
698 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
699 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
700 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
701 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
702 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
703 OBD_MD_FLDATAVERSION;
704 oinfo.oi_capa = capa;
/* @sync path (guard not visible): ask OSTs for a server-side lock. */
706 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
707 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
710 set = ptlrpc_prep_set();
712 CERROR("can't allocate ptlrpc set\n");
715 rc = obd_getattr_async(exp, &oinfo, set);
717 rc = ptlrpc_set_wait(set);
718 ptlrpc_set_destroy(set);
/* Keep only the fields the caller may legitimately consume. */
721 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
722 OBD_MD_FLATIME | OBD_MD_FLMTIME |
723 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
724 OBD_MD_FLDATAVERSION);
729 * Performs the getattr on the inode and updates its fields.
730 * If @sync != 0, perform the getattr under the server-side lock.
732 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
733 __u64 ioepoch, int sync)
735 struct obd_capa *capa = ll_mdscapa_get(inode);
736 struct lov_stripe_md *lsm;
740 lsm = ccc_inode_lsm_get(inode);
741 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
742 capa, obdo, ioepoch, sync);
/* On success (guard not visible here), refresh the VFS inode from
 * the returned obdo fields. */
745 obdo_refresh_inode(inode, obdo, obdo->o_valid);
747 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
748 lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
749 (unsigned long long)inode->i_blocks,
750 (unsigned long)ll_inode_blksize(inode));
752 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge the MDS-cached timestamps with the OST-side lock value block
 * and apply the result (size, blocks, a/m/ctime) to the VFS inode,
 * all under the inode size lock.
 */
756 int ll_merge_lvb(struct inode *inode)
758 struct ll_inode_info *lli = ll_i2info(inode);
759 struct ll_sb_info *sbi = ll_i2sbi(inode);
760 struct lov_stripe_md *lsm;
766 lsm = ccc_inode_lsm_get(inode);
767 ll_inode_size_lock(inode);
768 inode_init_lvb(inode, &lvb);
770 /* merge timestamps the most resently obtained from mds with
771 timestamps obtained from osts */
772 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
773 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
774 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
776 rc = obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
777 cl_isize_write_nolock(inode, lvb.lvb_size);
779 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
780 PFID(&lli->lli_fid), lvb.lvb_size);
781 inode->i_blocks = lvb.lvb_blocks;
783 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
784 LTIME_S(inode->i_atime) = lvb.lvb_atime;
785 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
786 ll_inode_size_unlock(inode);
787 ccc_inode_lsm_put(inode, lsm);
/*
 * Glimpse helper for ioctls: fetch current OST attributes for @lsm and
 * copy size/blocks/timestamps into the caller-supplied stat structure.
 */
792 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
795 struct obdo obdo = { 0 };
798 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
800 st->st_size = obdo.o_size;
801 st->st_blocks = obdo.o_blocks;
802 st->st_mtime = obdo.o_mtime;
803 st->st_atime = obdo.o_atime;
804 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: non-blocking/append/sync flags, the target cl_object, and the
 * lock-request policy (never for no-lock files, mandatory for append).
 */
809 void ll_io_init(struct cl_io *io, const struct file *file, int write)
811 struct inode *inode = file->f_dentry->d_inode;
813 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
815 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
816 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || IS_SYNC(inode);
818 io->ci_obj = ll_i2info(inode)->lli_clob;
819 io->ci_lockreq = CILR_MAYBE;
820 if (ll_file_nolock(file)) {
821 io->ci_lockreq = CILR_NEVER;
822 io->ci_no_srvlock = 1;
823 } else if (file->f_flags & O_APPEND) {
/* Appends must serialize against concurrent writers. */
824 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common driver for all read/write entry points (read/write, readv,
 * aio, sendfile, splice).  Sets up the cl_io from @args, takes the
 * appropriate serialization (lli_write_mutex for non-group-lock writes,
 * lli_trunc_sem read-side for normal reads), runs the IO loop, advances
 * *ppos, and tallies per-sb statistics.
 */
829 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
830 struct file *file, enum cl_io_type iot,
831 loff_t *ppos, size_t count)
833 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
834 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
839 io = ccc_env_thread_io(env);
840 ll_io_init(io, file, iot == CIT_WRITE);
842 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
843 struct vvp_io *vio = vvp_env_io(env);
844 struct ccc_io *cio = ccc_env_io(env);
845 int write_mutex_locked = 0;
847 cio->cui_fd = LUSTRE_FPRIVATE(file);
848 vio->cui_io_subtype = args->via_io_subtype;
/* Dispatch on the IO flavor: normal iovec, sendfile, or splice. */
850 switch (vio->cui_io_subtype) {
852 cio->cui_iov = args->u.normal.via_iov;
853 cio->cui_nrsegs = args->u.normal.via_nrsegs;
854 cio->cui_tot_nrsegs = cio->cui_nrsegs;
855 #ifndef HAVE_FILE_WRITEV
856 cio->cui_iocb = args->u.normal.via_iocb;
/* Group-locked writes skip the per-inode write mutex. */
858 if ((iot == CIT_WRITE) &&
859 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
860 if (mutex_lock_interruptible(&lli->
862 GOTO(out, result = -ERESTARTSYS);
863 write_mutex_locked = 1;
864 } else if (iot == CIT_READ) {
/* Block concurrent truncate while the read is in flight. */
865 down_read(&lli->lli_trunc_sem);
869 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
870 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
873 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
874 vio->u.splice.cui_flags = args->u.splice.via_flags;
877 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
880 result = cl_io_loop(env, io);
881 if (write_mutex_locked)
882 mutex_unlock(&lli->lli_write_mutex);
883 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
884 up_read(&lli->lli_trunc_sem);
886 /* cl_io_rw_init() handled IO */
887 result = io->ci_result;
/* Positive byte count: advance the file position for the caller. */
890 if (io->ci_nob > 0) {
892 *ppos = io->u.ci_wr.wr.crw_pos;
898 if (iot == CIT_READ) {
900 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
901 LPROC_LL_READ_BYTES, result);
902 } else if (iot == CIT_WRITE) {
904 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
905 LPROC_LL_WRITE_BYTES, result);
/* Track write outcome so close() can report deferred failures. */
906 fd->fd_write_failed = false;
908 fd->fd_write_failed = true;
917 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
919 static int ll_file_get_iov_count(const struct iovec *iov,
920 unsigned long *nr_segs, size_t *count)
925 for (seg = 0; seg < *nr_segs; seg++) {
926 const struct iovec *iv = &iov[seg];
929 * If any segment has a negative length, or the cumulative
930 * length ever wraps negative then return -EINVAL.
933 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* access_ok() success means the segment is usable; failure paths
 * (truncation to the accessible prefix) are not visible here. */
935 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
940 cnt -= iv->iov_len; /* This segment is no good */
947 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-AIO kernels): validate the iovec,
 * then run the common IO path with CIT_READ. */
948 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
949 unsigned long nr_segs, loff_t *ppos)
952 struct vvp_io_args *args;
958 result = ll_file_get_iov_count(iov, &nr_segs, &count);
962 env = cl_env_get(&refcheck);
964 RETURN(PTR_ERR(env));
966 args = vvp_env_args(env, IO_NORMAL);
967 args->u.normal.via_iov = (struct iovec *)iov;
968 args->u.normal.via_nrsegs = nr_segs;
970 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
971 cl_env_put(env, &refcheck);
/* Plain read(2) entry point (readv-based kernels): wrap the user
 * buffer in a single-segment iovec and delegate to ll_file_readv(). */
975 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
979 struct iovec *local_iov;
984 env = cl_env_get(&refcheck);
986 RETURN(PTR_ERR(env));
988 local_iov = &vvp_env_info(env)->vti_local_iov;
989 local_iov->iov_base = (void __user *)buf;
990 local_iov->iov_len = count;
991 result = ll_file_readv(file, local_iov, 1, ppos);
992 cl_env_put(env, &refcheck);
/* AIO read entry point: validate the iovec, record the kiocb in the
 * IO args, and run the common IO path at iocb->ki_pos. */
997 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
998 unsigned long nr_segs, loff_t pos)
1001 struct vvp_io_args *args;
1007 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1011 env = cl_env_get(&refcheck);
1013 RETURN(PTR_ERR(env));
1015 args = vvp_env_args(env, IO_NORMAL);
1016 args->u.normal.via_iov = (struct iovec *)iov;
1017 args->u.normal.via_nrsegs = nr_segs;
1018 args->u.normal.via_iocb = iocb;
1020 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1021 &iocb->ki_pos, count);
1022 cl_env_put(env, &refcheck);
/* Plain read(2) entry point (AIO-based kernels): build a synchronous
 * kiocb plus single-segment iovec and delegate to ll_file_aio_read(),
 * then propagate the updated position back to *ppos. */
1026 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1030 struct iovec *local_iov;
1031 struct kiocb *kiocb;
1036 env = cl_env_get(&refcheck);
1038 RETURN(PTR_ERR(env));
1040 local_iov = &vvp_env_info(env)->vti_local_iov;
1041 kiocb = &vvp_env_info(env)->vti_kiocb;
1042 local_iov->iov_base = (void __user *)buf;
1043 local_iov->iov_len = count;
1044 init_sync_kiocb(kiocb, file);
1045 kiocb->ki_pos = *ppos;
1046 kiocb->ki_left = count;
1048 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1049 *ppos = kiocb->ki_pos;
1051 cl_env_put(env, &refcheck);
1057 * Write to a file (through the page cache).
1059 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-AIO kernels): validate the iovec,
 * then run the common IO path with CIT_WRITE. */
1060 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1061 unsigned long nr_segs, loff_t *ppos)
1064 struct vvp_io_args *args;
1070 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1074 env = cl_env_get(&refcheck);
1076 RETURN(PTR_ERR(env));
1078 args = vvp_env_args(env, IO_NORMAL);
1079 args->u.normal.via_iov = (struct iovec *)iov;
1080 args->u.normal.via_nrsegs = nr_segs;
1082 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1083 cl_env_put(env, &refcheck);
/* Plain write(2) entry point (writev-based kernels): wrap the user
 * buffer in a single-segment iovec and delegate to ll_file_writev(). */
1087 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1091 struct iovec *local_iov;
1096 env = cl_env_get(&refcheck);
1098 RETURN(PTR_ERR(env));
1100 local_iov = &vvp_env_info(env)->vti_local_iov;
1101 local_iov->iov_base = (void __user *)buf;
1102 local_iov->iov_len = count;
1104 result = ll_file_writev(file, local_iov, 1, ppos);
1105 cl_env_put(env, &refcheck);
1109 #else /* AIO stuff */
/* AIO write entry point: validate the iovec, record the kiocb in the
 * IO args, and run the common IO path at iocb->ki_pos. */
1110 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1111 unsigned long nr_segs, loff_t pos)
1114 struct vvp_io_args *args;
1120 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1124 env = cl_env_get(&refcheck);
1126 RETURN(PTR_ERR(env));
1128 args = vvp_env_args(env, IO_NORMAL);
1129 args->u.normal.via_iov = (struct iovec *)iov;
1130 args->u.normal.via_nrsegs = nr_segs;
1131 args->u.normal.via_iocb = iocb;
1133 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1134 &iocb->ki_pos, count);
1135 cl_env_put(env, &refcheck);
/* Plain write(2) entry point (AIO-based kernels): build a synchronous
 * kiocb plus single-segment iovec and delegate to ll_file_aio_write(),
 * then propagate the updated position back to *ppos. */
1139 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1143 struct iovec *local_iov;
1144 struct kiocb *kiocb;
1149 env = cl_env_get(&refcheck);
1151 RETURN(PTR_ERR(env));
1153 local_iov = &vvp_env_info(env)->vti_local_iov;
1154 kiocb = &vvp_env_info(env)->vti_kiocb;
1155 local_iov->iov_base = (void __user *)buf;
1156 local_iov->iov_len = count;
1157 init_sync_kiocb(kiocb, file);
1158 kiocb->ki_pos = *ppos;
1159 kiocb->ki_left = count;
1161 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1162 *ppos = kiocb->ki_pos;
1164 cl_env_put(env, &refcheck);
1170 #ifdef HAVE_KERNEL_SENDFILE
1172 * Send file content (through pagecache) somewhere with helper
/* sendfile() entry point: package the actor/target in IO_SENDFILE args
 * and run the common read path. */
1174 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1175 read_actor_t actor, void *target)
1178 struct vvp_io_args *args;
1183 env = cl_env_get(&refcheck);
1185 RETURN(PTR_ERR(env));
1187 args = vvp_env_args(env, IO_SENDFILE);
1188 args->u.sendfile.via_target = target;
1189 args->u.sendfile.via_actor = actor;
1191 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1192 cl_env_put(env, &refcheck);
1197 #ifdef HAVE_KERNEL_SPLICE_READ
1199 * Send file content (through pagecache) somewhere with helper
/* splice_read() entry point: package the pipe/flags in IO_SPLICE args
 * and run the common read path. */
1201 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1202 struct pipe_inode_info *pipe, size_t count,
1206 struct vvp_io_args *args;
1211 env = cl_env_get(&refcheck);
1213 RETURN(PTR_ERR(env));
1215 args = vvp_env_args(env, IO_SPLICE);
1216 args->u.splice.via_pipe = pipe;
1217 args->u.splice.via_flags = flags;
1219 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1220 cl_env_put(env, &refcheck);
/*
 * Recreate a lost OST object for @inode with object id @id on OST
 * @ost_idx: clone the stripe metadata, mark the obdo with
 * OBD_FL_RECREATE_OBJS, and call obd_create() under the size lock.
 * NOTE(review): partial listing — the obdo allocation and some error
 * checks are not visible here.
 */
1225 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1228 struct obd_export *exp = ll_i2dtexp(inode);
1229 struct obd_trans_info oti = { 0 };
1230 struct obdo *oa = NULL;
1233 struct lov_stripe_md *lsm = NULL, *lsm2;
1240 lsm = ccc_inode_lsm_get(inode);
1242 GOTO(out, rc = -ENOENT);
/* Working copy of the stripe md, sized for its stripe count. */
1244 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1245 (lsm->lsm_stripe_count));
1247 OBD_ALLOC_LARGE(lsm2, lsm_size);
1249 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate operation. */
1253 oa->o_nlink = ost_idx;
1254 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1255 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1256 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1257 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1258 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1259 memcpy(lsm2, lsm, lsm_size);
1260 ll_inode_size_lock(inode);
1261 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1262 ll_inode_size_unlock(inode);
1264 OBD_FREE_LARGE(lsm2, lsm_size);
1267 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: copy the ll_recreate_obj request
 * from userspace (admin-only) and recreate the named object.
 */
1272 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1274 struct ll_recreate_obj ucreat;
1277 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1280 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1281 sizeof(struct ll_recreate_obj)))
1284 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1285 ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID ioctl handler: copy a lu_fid from userspace
 * (admin-only), decode the legacy object id / OST index packing from
 * its sequence and oid fields, and recreate the object.
 */
1288 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1295 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1298 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1299 sizeof(struct lu_fid)))
/* Legacy encoding: low 16 bits of seq select the OST, the next bits
 * combine with oid to form the object id. */
1302 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1303 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1304 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Set striping on @inode by performing an intent open that carries the
 * user-supplied lov_user_md (@lum).  Fails (via the visible early-out)
 * if stripes already exist; the open handle is released immediately.
 */
1307 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1308 int flags, struct lov_user_md *lum, int lum_size)
1310 struct lov_stripe_md *lsm = NULL;
1311 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* Striping is write-once: refuse if the inode already has an lsm. */
1315 lsm = ccc_inode_lsm_get(inode);
1317 ccc_inode_lsm_put(inode, lsm);
1318 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1323 ll_inode_size_lock(inode);
1324 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1327 rc = oit.d.lustre.it_status;
1329 GOTO(out_req_free, rc);
/* The open was only a vehicle for the setstripe; close it now. */
1331 ll_release_openhandle(file->f_dentry, &oit);
1334 ll_inode_size_unlock(inode);
1335 ll_intent_release(&oit);
1336 ccc_inode_lsm_put(inode, lsm);
1339 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) of @filename (child of @inode)
 * from the MDS via md_getattr_name(), byte-swap it to host endianness if
 * needed, and return pointers into the reply buffer.
 *
 * On success *lmmp/*lmm_size describe the EA inside *request; the caller
 * owns the ptlrpc request and must release it (which also invalidates
 * the returned lmm pointer).
 */
1343 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1344 struct lov_mds_md **lmmp, int *lmm_size,
1345 struct ptlrpc_request **request)
1347 struct ll_sb_info *sbi = ll_i2sbi(inode);
1348 struct mdt_body *body;
1349 struct lov_mds_md *lmm = NULL;
1350 struct ptlrpc_request *req = NULL;
1351 struct md_op_data *op_data;
/* Ask for the largest EA the MDS may return so the reply buffer fits. */
1354 rc = ll_get_max_mdsize(sbi, &lmmsize);
1358 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1359 strlen(filename), lmmsize,
1360 LUSTRE_OPC_ANY, NULL);
1361 if (IS_ERR(op_data))
1362 RETURN(PTR_ERR(op_data));
1364 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1365 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1366 ll_finish_md_op_data(op_data);
1368 CDEBUG(D_INFO, "md_getattr_name failed "
1369 "on %s: rc %d\n", filename, rc);
1373 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1374 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1376 lmmsize = body->eadatasize;
/* No EA present (or zero-sized) -> nothing to return. */
1378 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1380 GOTO(out, rc = -ENODATA);
1383 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1384 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are understood here. */
1386 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1387 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1388 GOTO(out, rc = -EPROTO);
1392 * This is coming from the MDS, so is probably in
1393 * little endian. We convert it to host endian before
1394 * passing it to userspace.
/* On big-endian hosts swab in place; directories carry a default
 * layout with no objects, so only swab lmm_objects for regular files. */
1396 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1397 /* if function called for directory - we should
1398 * avoid swab not existent lsm objects */
1399 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1400 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1401 if (S_ISREG(body->mode))
1402 lustre_swab_lov_user_md_objects(
1403 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1404 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1405 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1406 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1407 if (S_ISREG(body->mode))
1408 lustre_swab_lov_user_md_objects(
1409 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1410 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1416 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl: set a raw striping EA (including explicit
 * object ids, MDS_OPEN_HAS_OBJS) copied from userspace.
 * Admin-only; the temporary lum buffer is freed on every path.
 */
1421 static int ll_lov_setea(struct inode *inode, struct file *file,
1424 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1425 struct lov_user_md *lump;
/* Room for the header plus one ll_user_ost_data entry. */
1426 int lum_size = sizeof(struct lov_user_md) +
1427 sizeof(struct lov_user_ost_data);
1431 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1434 OBD_ALLOC_LARGE(lump, lum_size);
1438 if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1439 OBD_FREE_LARGE(lump, lum_size);
1443 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1445 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl: copy a lov_user_md (V1, re-read as V3 if
 * the magic says so) from userspace and set the file's striping. On
 * success, echo the resulting layout back to the user buffer via the
 * OBD getstripe iocontrol.
 */
1449 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1452 struct lov_user_md_v3 lumv3;
/* lumv3 doubles as storage for the smaller V1 form. */
1453 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1454 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1455 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1458 int flags = FMODE_WRITE;
1461 /* first try with v1 which is smaller than v3 */
1462 lum_size = sizeof(struct lov_user_md_v1);
1463 if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
/* V3 magic -> re-copy the full V3 structure. */
1466 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1467 lum_size = sizeof(struct lov_user_md_v3);
1468 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1472 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1474 struct lov_stripe_md *lsm;
/* Pre-zero the user's stripe count in case getstripe writes less. */
1475 put_user(0, &lumv1p->lmm_stripe_count);
1476 lsm = ccc_inode_lsm_get(inode);
1477 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1478 0, lsm, (void *)arg);
1479 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl: copy the file's striping information to
 * the user buffer at @arg via the OBD getstripe iocontrol.
 */
1484 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1486 struct lov_stripe_md *lsm;
1490 lsm = ccc_inode_lsm_get(inode);
1492 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1494 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK ioctl: take a cl-layer group lock with gid @arg on
 * the whole file and record it in the per-open file data (fd).
 * Only one group lock per open file descriptor is allowed; fd_flags and
 * fd_grouplock are protected by lli->lli_lock.
 */
1498 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1500 struct ll_inode_info *lli = ll_i2info(inode);
1501 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1502 struct ccc_grouplock grouplock;
/* Group locks make no sense on nolock mounts. */
1506 if (ll_file_nolock(file))
1507 RETURN(-EOPNOTSUPP);
1509 spin_lock(&lli->lli_lock);
1510 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1511 CWARN("group lock already existed with gid %lu\n",
1512 fd->fd_grouplock.cg_gid);
1513 spin_unlock(&lli->lli_lock);
1516 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1517 spin_unlock(&lli->lli_lock);
/* The enqueue may block (unless O_NONBLOCK), so drop the spinlock first. */
1519 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1520 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have raced us here. */
1524 spin_lock(&lli->lli_lock);
1525 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1526 spin_unlock(&lli->lli_lock);
1527 CERROR("another thread just won the race\n");
1528 cl_put_grouplock(&grouplock);
1532 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1533 fd->fd_grouplock = grouplock;
1534 spin_unlock(&lli->lli_lock);
1536 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK ioctl: release the group lock with gid @arg that
 * was previously recorded in the file's private data. The lock state is
 * detached from fd under lli->lli_lock, then released outside it.
 */
1540 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1542 struct ll_inode_info *lli = ll_i2info(inode);
1543 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1544 struct ccc_grouplock grouplock;
1547 spin_lock(&lli->lli_lock);
1548 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1549 spin_unlock(&lli->lli_lock);
1550 CWARN("no group lock held\n");
1553 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* Caller must pass the same gid it locked with. */
1555 if (fd->fd_grouplock.cg_gid != arg) {
1556 CWARN("group lock %lu doesn't match current id %lu\n",
1557 arg, fd->fd_grouplock.cg_gid);
1558 spin_unlock(&lli->lli_lock);
/* Detach the lock from fd before dropping the spinlock, so the
 * actual cl_put_grouplock() can run unlocked. */
1562 grouplock = fd->fd_grouplock;
1563 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1564 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1565 spin_unlock(&lli->lli_lock);
1567 cl_put_grouplock(&grouplock);
1568 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1573 * Close inode open handle
1575 * \param dentry [in] dentry which contains the inode
1576 * \param it [in,out] intent which contains open info and result
1579 * \retval <0 failure
1581 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1583 struct inode *inode = dentry->d_inode;
1584 struct obd_client_handle *och;
1590 /* Root ? Do nothing. */
1591 if (dentry->d_inode->i_sb->s_root == dentry)
1594 /* No open handle to close? Move away */
1595 if (!it_disposition(it, DISP_OPEN_OPEN))
1598 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
/* Build a transient client handle just to close the MDS open. */
1600 OBD_ALLOC(och, sizeof(*och));
1602 GOTO(out, rc = -ENOMEM);
1604 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1605 ll_i2info(inode), it, och);
1607 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1610 /* this one is in place of ll_file_open */
1611 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1612 ptlrpc_req_finished(it->d.lustre.it_data);
1613 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1619 * Get size for inode for which FIEMAP mapping is requested.
1620 * Make the FIEMAP get_info call and returns the result.
1622 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1625 struct obd_export *exp = ll_i2dtexp(inode);
1626 struct lov_stripe_md *lsm = NULL;
1627 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1628 int vallen = num_bytes;
1632 /* Checks for fiemap flags */
/* Unknown flags: report back the set of unsupported bits (the
 * FIEMAP protocol expects -EBADR with fm_flags holding them). */
1633 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1634 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1638 /* Check for FIEMAP_FLAG_SYNC */
1639 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1640 rc = filemap_fdatawrite(inode->i_mapping);
1645 lsm = ccc_inode_lsm_get(inode);
1649 /* If the stripe_count > 1 and the application does not understand
1650 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1652 if (lsm->lsm_stripe_count > 1 &&
1653 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1654 GOTO(out, rc = -EOPNOTSUPP);
1656 fm_key.oa.o_id = lsm->lsm_object_id;
1657 fm_key.oa.o_seq = lsm->lsm_object_seq;
1658 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1660 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1661 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1662 /* If filesize is 0, then there would be no objects for mapping */
1663 if (fm_key.oa.o_size == 0) {
1664 fiemap->fm_mapped_extents = 0;
1668 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
/* Delegate the actual extent mapping to the LOV/OSC layer. */
1670 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1673 CERROR("obd_get_info failed: rc = %d\n", rc);
1676 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH ioctl: resolve a FID to a pathname via the MDC.
 * Copies a getinfo_fid2path header in, allocates an output buffer of
 * header + user-requested pathlen, and copies the filled result back.
 * Permitted for CAP_DAC_READ_SEARCH or when the fs allows user fid2path.
 * NOTE(review): cleanup labels/frees for gfin are elided in this view.
 */
1680 int ll_fid2path(struct inode *inode, void *arg)
1682 struct obd_export *exp = ll_i2mdexp(inode);
1683 struct getinfo_fid2path *gfout, *gfin;
1687 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1688 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1691 /* Need to get the buflen */
1692 OBD_ALLOC_PTR(gfin);
1695 if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1700 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1701 OBD_ALLOC(gfout, outsize);
1702 if (gfout == NULL) {
1706 memcpy(gfout, gfin, sizeof(*gfout));
1709 /* Call mdc_iocontrol */
1710 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1713 if (cfs_copy_to_user(arg, gfout, outsize))
1717 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl wrapper: size a kernel fiemap buffer from the
 * user-supplied fm_extent_count, copy the request (and first extent,
 * used as the continuation cookie) in, run ll_do_fiemap(), and copy the
 * header plus mapped extents back out.
 */
1721 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1723 struct ll_user_fiemap *fiemap_s;
1724 size_t num_bytes, ret_bytes;
1725 unsigned int extent_count;
1728 /* Get the extent count so we can calculate the size of
1729 * required fiemap buffer */
1730 if (get_user(extent_count,
1731 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count comes from userspace; the multiplication
 * below has no visible overflow check in this view — confirm upstream. */
1733 num_bytes = sizeof(*fiemap_s) + (extent_count *
1734 sizeof(struct ll_fiemap_extent));
1736 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1737 if (fiemap_s == NULL)
1740 /* get the fiemap value */
1741 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1743 GOTO(error, rc = -EFAULT);
1745 /* If fm_extent_count is non-zero, read the first extent since
1746 * it is used to calculate end_offset and device from previous
1749 if (copy_from_user(&fiemap_s->fm_extents[0],
1750 (char __user *)arg + sizeof(*fiemap_s),
1751 sizeof(struct ll_fiemap_extent)))
1752 GOTO(error, rc = -EFAULT);
1755 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header, plus extents only if the caller asked for any. */
1759 ret_bytes = sizeof(struct ll_user_fiemap);
1761 if (extent_count != 0)
1762 ret_bytes += (fiemap_s->fm_mapped_extents *
1763 sizeof(struct ll_fiemap_extent));
1765 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1769 OBD_FREE_LARGE(fiemap_s, num_bytes);
1774 * Read the data_version for inode.
1776 * This value is computed using stripe object version on OST.
1777 * Version is computed using server side locking.
1779 * @param extent_lock Take extent lock. Not needed if a process is already
1780 * holding the OST object group locks.
1782 static int ll_data_version(struct inode *inode, __u64 *data_version,
1785 struct lov_stripe_md *lsm = NULL;
1786 struct ll_sb_info *sbi = ll_i2sbi(inode);
1787 struct obdo *obdo = NULL;
1791 /* If no stripe, we consider version is 0. */
1792 lsm = ccc_inode_lsm_get(inode);
1795 CDEBUG(D_INODE, "No object for inode\n");
1799 OBD_ALLOC_PTR(obdo);
/* Allocation-failure path: drop the lsm ref before returning. */
1801 ccc_inode_lsm_put(inode, lsm);
1805 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
/* OST did not report a data version -> treat as unsupported. */
1807 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1810 *data_version = obdo->o_data_version;
1814 ccc_inode_lsm_put(inode, lsm);
/*
 * Main ioctl dispatcher for regular Lustre files. Handles file-flag
 * get/set, striping (setstripe/setea/getstripe), object recreation,
 * fiemap, group locks, statfs, fid2path, data-version, MDT index and
 * OBD name queries; anything unrecognized is first offered to the
 * dynamic ll_iocontrol_call() hooks and finally forwarded to the data
 * export via obd_iocontrol().
 */
1819 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1821 struct inode *inode = file->f_dentry->d_inode;
1822 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1827 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1828 inode->i_generation, inode, cmd);
1829 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1831 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1832 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1836 case LL_IOC_GETFLAGS:
1837 /* Get the current value of the file flags */
1838 return put_user(fd->fd_flags, (int *)arg);
1839 case LL_IOC_SETFLAGS:
1840 case LL_IOC_CLRFLAGS:
1841 /* Set or clear specific file flags */
1842 /* XXX This probably needs checks to ensure the flags are
1843 * not abused, and to handle any flag side effects.
1845 if (get_user(flags, (int *) arg))
1848 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking only makes sense for O_DIRECT I/O. */
1849 if ((flags & LL_FILE_IGNORE_LOCK) &&
1850 !(file->f_flags & O_DIRECT)) {
1851 CERROR("%s: unable to disable locking on "
1852 "non-O_DIRECT file\n", current->comm);
1856 fd->fd_flags |= flags;
1858 fd->fd_flags &= ~flags;
1861 case LL_IOC_LOV_SETSTRIPE:
1862 RETURN(ll_lov_setstripe(inode, file, arg));
1863 case LL_IOC_LOV_SETEA:
1864 RETURN(ll_lov_setea(inode, file, arg));
1865 case LL_IOC_LOV_GETSTRIPE:
1866 RETURN(ll_lov_getstripe(inode, arg));
1867 case LL_IOC_RECREATE_OBJ:
1868 RETURN(ll_lov_recreate_obj(inode, arg));
1869 case LL_IOC_RECREATE_FID:
1870 RETURN(ll_lov_recreate_fid(inode, arg));
1871 case FSFILT_IOC_FIEMAP:
1872 RETURN(ll_ioctl_fiemap(inode, arg));
1873 case FSFILT_IOC_GETFLAGS:
1874 case FSFILT_IOC_SETFLAGS:
1875 RETURN(ll_iocontrol(inode, file, cmd, arg));
1876 case FSFILT_IOC_GETVERSION_OLD:
1877 case FSFILT_IOC_GETVERSION:
1878 RETURN(put_user(inode->i_generation, (int *)arg));
1879 case LL_IOC_GROUP_LOCK:
1880 RETURN(ll_get_grouplock(inode, file, arg));
1881 case LL_IOC_GROUP_UNLOCK:
1882 RETURN(ll_put_grouplock(inode, file, arg));
1883 case IOC_OBD_STATFS:
1884 RETURN(ll_obd_statfs(inode, (void *)arg));
1886 /* We need to special case any other ioctls we want to handle,
1887 * to send them to the MDS/OST as appropriate and to properly
1888 * network encode the arg field.
1889 case FSFILT_IOC_SETVERSION_OLD:
1890 case FSFILT_IOC_SETVERSION:
1892 case LL_IOC_FLUSHCTX:
1893 RETURN(ll_flush_ctx(inode));
1894 case LL_IOC_PATH2FID: {
1895 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1896 sizeof(struct lu_fid)))
1901 case OBD_IOC_FID2PATH:
1902 RETURN(ll_fid2path(inode, (void *)arg));
1903 case LL_IOC_DATA_VERSION: {
1904 struct ioc_data_version idv;
1907 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* LL_DV_NOFLUSH lets callers skip the extent lock/flush step. */
1910 rc = ll_data_version(inode, &idv.idv_version,
1911 !(idv.idv_flags & LL_DV_NOFLUSH));
1914 cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1920 case LL_IOC_GET_MDTIDX: {
1923 mdtidx = ll_get_mdt_idx(inode);
1927 if (put_user((int)mdtidx, (int*)arg))
1932 case OBD_IOC_GETDTNAME:
1933 case OBD_IOC_GETMDNAME:
1934 RETURN(ll_get_obd_name(inode, cmd, arg));
/* Default: try registered handlers, then forward to the DT export. */
1939 ll_iocontrol_call(inode, file, cmd, arg, &err))
1942 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek implementation. For SEEK_END (origin == 2) the file size is
 * refreshed from the OSTs via ll_glimpse_size() before computing the
 * target offset; the result is bounded by ll_file_maxbytes().
 */
1948 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1950 struct inode *inode = file->f_dentry->d_inode;
1953 retval = offset + ((origin == 2) ? i_size_read(inode) :
1954 (origin == 1) ? file->f_pos : 0);
1955 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1956 inode->i_ino, inode->i_generation, inode, retval, retval,
1957 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1958 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1960 if (origin == 2) { /* SEEK_END */
1963 rc = ll_glimpse_size(inode);
1967 offset += i_size_read(inode);
1968 } else if (origin == 1) { /* SEEK_CUR */
1969 offset += file->f_pos;
1973 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1974 if (offset != file->f_pos) {
1975 file->f_pos = offset;
/*
 * ->flush file operation: surface async writeback errors recorded on the
 * inode/object at close(2) time. Errors are reported as -EIO, but only
 * once — if a prior fsync/close already reported the failure
 * (fd->fd_write_failed), the error is not repeated.
 */
1983 int ll_flush(struct file *file, fl_owner_t id)
1985 struct inode *inode = file->f_dentry->d_inode;
1986 struct ll_inode_info *lli = ll_i2info(inode);
1987 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1990 LASSERT(!S_ISDIR(inode->i_mode));
1992 /* catch async errors that were recorded back when async writeback
1993 * failed for pages in this mapping. */
1994 rc = lli->lli_async_rc;
1995 lli->lli_async_rc = 0;
1996 err = lov_read_and_clear_async_rc(lli->lli_clob);
2000 /* The application has been told write failure already.
2001 * Do not report failure again. */
2002 if (fd->fd_write_failed)
2004 return rc ? -EIO : 0;
2008 * Called to make sure a portion of file has been written out.
2009 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2011 * Return how many pages have been written.
2013 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2014 enum cl_fsync_mode mode)
2016 struct cl_env_nest nest;
2019 struct obd_capa *capa = NULL;
2020 struct cl_fsync_io *fio;
/* Only the four defined fsync modes are accepted. */
2024 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2025 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2028 env = cl_env_nested_get(&nest);
2030 RETURN(PTR_ERR(env));
2032 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2034 io = ccc_env_thread_io(env);
2035 io->ci_obj = cl_i2info(inode)->lli_clob;
/* fsync must work even during layout changes. */
2036 io->ci_ignore_layout = 1;
2038 /* initialize parameters for sync */
2039 fio = &io->u.ci_fsync;
2040 fio->fi_capa = capa;
2041 fio->fi_start = start;
2043 fio->fi_fid = ll_inode2fid(inode);
2044 fio->fi_mode = mode;
2045 fio->fi_nr_written = 0;
2047 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2048 result = cl_io_loop(env, io);
2050 result = io->ci_result;
/* On success, report the number of pages written by the sync. */
2052 result = fio->fi_nr_written;
2053 cl_io_fini(env, io);
2054 cl_env_nested_put(&nest, env);
/*
 * ->fsync file operation; the three prototypes match the kernel API
 * variants detected at configure time. Flushes dirty pages, collects
 * recorded async write errors, syncs metadata via md_sync(), and for
 * regular files forces data out with cl_sync_file_range(CL_FSYNC_ALL),
 * updating fd_write_failed accordingly.
 */
2061 #ifdef HAVE_FILE_FSYNC_4ARGS
2062 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2063 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2064 int ll_fsync(struct file *file, int data)
2066 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2069 struct inode *inode = file->f_dentry->d_inode;
2070 struct ll_inode_info *lli = ll_i2info(inode);
2071 struct ptlrpc_request *req;
2072 struct obd_capa *oc;
2076 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2077 inode->i_generation, inode);
2078 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2080 #ifdef HAVE_FILE_FSYNC_4ARGS
/* New API: caller does not hold i_mutex and has not written the range. */
2081 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2082 mutex_lock(&inode->i_mutex);
2084 /* fsync's caller has already called _fdata{sync,write}, we want
2085 * that IO to finish before calling the osc and mdc sync methods */
2086 rc = filemap_fdatawait(inode->i_mapping);
2089 /* catch async errors that were recorded back when async writeback
2090 * failed for pages in this mapping. */
2091 if (!S_ISDIR(inode->i_mode)) {
2092 err = lli->lli_async_rc;
2093 lli->lli_async_rc = 0;
2096 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the MDS-side metadata for this inode. */
2101 oc = ll_mdscapa_get(inode);
2102 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2108 ptlrpc_req_finished(req);
2111 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2113 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2115 if (rc == 0 && err < 0)
/* Remember the outcome so ll_flush() doesn't double-report. */
2118 fd->fd_write_failed = true;
2120 fd->fd_write_failed = false;
2123 #ifdef HAVE_FILE_FSYNC_4ARGS
2124 mutex_unlock(&inode->i_mutex);
/*
 * flock/posix-lock file operation: translate a VFS file_lock into an
 * LDLM_FLOCK enqueue on the MDS. Handles F_SETLK(W)/F_GETLK plus the
 * *64 variants, maps read/write/unlock to LCK_PR/LCK_PW/LCK_NL, and on
 * success mirrors the lock into the local VFS lock lists.
 */
2129 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2131 struct inode *inode = file->f_dentry->d_inode;
2132 struct ll_sb_info *sbi = ll_i2sbi(inode);
2133 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2134 .ei_cb_cp =ldlm_flock_completion_ast,
2135 .ei_cbdata = file_lock };
2136 struct md_op_data *op_data;
2137 struct lustre_handle lockh = {0};
2138 ldlm_policy_data_t flock = {{0}};
2143 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2144 inode->i_ino, file_lock);
2146 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2148 if (file_lock->fl_flags & FL_FLOCK) {
2149 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2150 /* flocks are whole-file locks */
2151 flock.l_flock.end = OFFSET_MAX;
2152 /* For flocks owner is determined by the local file desctiptor*/
2153 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2154 } else if (file_lock->fl_flags & FL_POSIX) {
2155 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2156 flock.l_flock.start = file_lock->fl_start;
2157 flock.l_flock.end = file_lock->fl_end;
2161 flock.l_flock.pid = file_lock->fl_pid;
2163 /* Somewhat ugly workaround for svc lockd.
2164 * lockd installs custom fl_lmops->lm_compare_owner that checks
2165 * for the fl_owner to be the same (which it always is on local node
2166 * I guess between lockd processes) and then compares pid.
2167 * As such we assign pid to the owner field to make it all work,
2168 * conflict with normal locks is unlikely since pid space and
2169 * pointer space for current->files are not intersecting */
2170 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2171 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2173 switch (file_lock->fl_type) {
2175 einfo.ei_mode = LCK_PR;
2178 /* An unlock request may or may not have any relation to
2179 * existing locks so we may not be able to pass a lock handle
2180 * via a normal ldlm_lock_cancel() request. The request may even
2181 * unlock a byte range in the middle of an existing lock. In
2182 * order to process an unlock request we need all of the same
2183 * information that is given with a normal read or write record
2184 * lock request. To avoid creating another ldlm unlock (cancel)
2185 * message we'll treat a LCK_NL flock request as an unlock. */
2186 einfo.ei_mode = LCK_NL;
2189 einfo.ei_mode = LCK_PW;
2192 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2193 file_lock->fl_type);
/* Non-blocking set: fail immediately instead of waiting. */
2208 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK: only test, don't actually take the lock. */
2214 flags = LDLM_FL_TEST_LOCK;
2215 /* Save the old mode so that if the mode in the lock changes we
2216 * can decrement the appropriate reader or writer refcount. */
2217 file_lock->fl_type = einfo.ei_mode;
2220 CERROR("unknown fcntl lock command: %d\n", cmd);
2224 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2225 LUSTRE_OPC_ANY, NULL);
2226 if (IS_ERR(op_data))
2227 RETURN(PTR_ERR(op_data));
2229 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2230 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2231 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2233 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2234 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2236 ll_finish_md_op_data(op_data);
/* Mirror the server-granted (or released) lock into local VFS state. */
2238 if ((file_lock->fl_flags & FL_FLOCK) &&
2239 (rc == 0 || file_lock->fl_type == F_UNLCK))
2240 flock_lock_file_wait(file, file_lock);
2241 if ((file_lock->fl_flags & FL_POSIX) &&
2242 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2243 !(flags & LDLM_FL_TEST_LOCK))
2244 posix_lock_file_wait(file, file_lock);
/* ->lock/->flock operation for "-o noflock" mounts; body elided in this
 * view (presumably returns -ENOSYS — see the noflock fops comment below
 * in the file; TODO confirm against full source). */
2249 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2257 * test if some locks matching bits and l_req_mode are acquired
2258 * - bits can be in different locks
2259 * - if found clear the common lock bits in *bits
2260 * - the bits not found, are kept in *bits
2262 * \param bits [IN] searched lock bits [IN]
2263 * \param l_req_mode [IN] searched lock mode
2264 * \retval boolean, true iff all bits are found
2266 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2268 struct lustre_handle lockh;
2269 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match all of CR/CW/PR/PW. */
2270 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2271 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2280 fid = &ll_i2info(inode)->lli_fid;
2281 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2282 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock. */
2284 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each requested ibit separately; stop early once all found. */
2285 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2286 policy.l_inodebits.bits = *bits & (1 << i);
2287 if (policy.l_inodebits.bits == 0)
2290 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2291 &policy, mode, &lockh)) {
2292 struct ldlm_lock *lock;
2294 lock = ldlm_handle2lock(&lockh);
2297 ~(lock->l_policy_data.l_inodebits.bits);
2298 LDLM_LOCK_PUT(lock);
2300 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MD lock covering the
 * given inodebits on @inode, in any of CR/CW/PR/PW modes. Returns the
 * matched mode (0 if none) and fills *lockh with the lock handle.
 */
2307 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2308 struct lustre_handle *lockh, __u64 flags)
2310 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2315 fid = &ll_i2info(inode)->lli_fid;
2316 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2318 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2319 fid, LDLM_IBITS, &policy,
2320 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process a revalidation RPC result: -ENOENT on an already-unlinked
 * inode is converted to success (nlink update path), other errors are
 * logged; only regular files and directories are expected here.
 */
2324 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2325 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2326 * and return success */
2328 /* This path cannot be hit for regular files unless in
2329 * case of obscure races, so no need to to validate
2331 if (!S_ISREG(inode->i_mode) &&
2332 !S_ISDIR(inode->i_mode))
2337 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry/inode against the MDS for the given ibits.
 * Two strategies: with OBD_CONNECT_ATTRFID, do a getattr-by-fid intent
 * lock (IT_GETATTR/IT_LOOKUP); otherwise, if no matching MD lock is
 * cached locally, issue a plain md_getattr() and refresh the inode.
 */
2344 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2347 struct inode *inode = dentry->d_inode;
2348 struct ptlrpc_request *req = NULL;
2349 struct obd_export *exp;
/* NULL inode here should be impossible; loud canary if it happens. */
2354 CERROR("REPORT THIS LINE TO PETER\n");
2358 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2359 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2361 exp = ll_i2mdexp(inode);
2363 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2364 * But under CMD case, it caused some lock issues, should be fixed
2365 * with new CMD ibits lock. See bug 12718 */
2366 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2367 struct lookup_intent oit = { .it_op = IT_GETATTR };
2368 struct md_op_data *op_data;
2370 if (ibits == MDS_INODELOCK_LOOKUP)
2371 oit.it_op = IT_LOOKUP;
2373 /* Call getattr by fid, so do not provide name at all. */
2374 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2375 dentry->d_inode, NULL, 0, 0,
2376 LUSTRE_OPC_ANY, NULL);
2377 if (IS_ERR(op_data))
2378 RETURN(PTR_ERR(op_data));
2380 oit.it_create_mode |= M_CHECK_STALE;
2381 rc = md_intent_lock(exp, op_data, NULL, 0,
2382 /* we are not interested in name
2385 ll_md_blocking_ast, 0);
2386 ll_finish_md_op_data(op_data);
2387 oit.it_create_mode &= ~M_CHECK_STALE;
2389 rc = ll_inode_revalidate_fini(inode, rc);
2393 rc = ll_revalidate_it_finish(req, &oit, dentry);
2395 ll_intent_release(&oit);
2399 /* Unlinked? Unhash dentry, so it is not picked up later by
2400 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2401 here to preserve get_cwd functionality on 2.6.
2403 if (!dentry->d_inode->i_nlink)
2404 d_lustre_invalidate(dentry);
2406 ll_lookup_finish_locks(&oit, dentry);
2407 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2408 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2409 obd_valid valid = OBD_MD_FLGETATTR;
2410 struct md_op_data *op_data;
/* For regular files also fetch the striping EA. */
2413 if (S_ISREG(inode->i_mode)) {
2414 rc = ll_get_max_mdsize(sbi, &ealen);
2417 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2420 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2421 0, ealen, LUSTRE_OPC_ANY,
2423 if (IS_ERR(op_data))
2424 RETURN(PTR_ERR(op_data));
2426 op_data->op_valid = valid;
2427 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2428 * capa for this inode. Because we only keep capas of dirs
2430 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2431 ll_finish_md_op_data(op_data);
2433 rc = ll_inode_revalidate_fini(inode, rc);
2437 rc = ll_prep_inode(&inode, req, NULL);
2440 ptlrpc_req_finished(req);
/*
 * Revalidate attributes and, for regular files, the size (via glimpse).
 * Non-regular inodes take their timestamps straight from the cached
 * lock value block instead of glimpsing the OSTs.
 */
2444 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2447 struct inode *inode = dentry->d_inode;
2451 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2455 /* if object isn't regular file, don't validate size */
2456 if (!S_ISREG(inode->i_mode)) {
2457 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2458 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2459 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2461 rc = ll_glimpse_size(inode);
/*
 * getattr with an explicit lookup intent: revalidate UPDATE|LOOKUP
 * ibits, then fill *stat from the (now fresh) inode fields. Under
 * 32-bit API mode the ino is derived from the FID instead of i_ino.
 */
2466 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2467 struct lookup_intent *it, struct kstat *stat)
2469 struct inode *inode = de->d_inode;
2470 struct ll_sb_info *sbi = ll_i2sbi(inode);
2471 struct ll_inode_info *lli = ll_i2info(inode);
2474 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2475 MDS_INODELOCK_LOOKUP);
2476 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2481 stat->dev = inode->i_sb->s_dev;
2482 if (ll_need_32bit_api(sbi))
2483 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2485 stat->ino = inode->i_ino;
2486 stat->mode = inode->i_mode;
2487 stat->nlink = inode->i_nlink;
2488 stat->uid = inode->i_uid;
2489 stat->gid = inode->i_gid;
2490 stat->rdev = inode->i_rdev;
2491 stat->atime = inode->i_atime;
2492 stat->mtime = inode->i_mtime;
2493 stat->ctime = inode->i_ctime;
2494 stat->blksize = 1 << inode->i_blkbits;
2496 stat->size = i_size_read(inode);
2497 stat->blocks = inode->i_blocks;
/* Standard ->getattr entry point: delegate with an IT_GETATTR intent. */
2501 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2503 struct lookup_intent it = { .it_op = IT_GETATTR };
2505 return ll_getattr_it(mnt, de, &it, stat);
/*
 * ->fiemap inode operation (kernels with linux/fiemap.h): bridge the
 * kernel fiemap_extent_info to Lustre's ll_user_fiemap, run
 * ll_do_fiemap(), and copy flags and mapped extents back.
 */
2508 #ifdef HAVE_LINUX_FIEMAP_H
2509 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2510 __u64 start, __u64 len)
2514 struct ll_user_fiemap *fiemap;
2515 unsigned int extent_count = fieinfo->fi_extents_max;
2517 num_bytes = sizeof(*fiemap) + (extent_count *
2518 sizeof(struct ll_fiemap_extent));
2519 OBD_ALLOC_LARGE(fiemap, num_bytes);
2524 fiemap->fm_flags = fieinfo->fi_flags;
2525 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2526 fiemap->fm_start = start;
2527 fiemap->fm_length = len;
/* First extent carries the continuation cookie (device/offset). */
2528 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2529 sizeof(struct ll_fiemap_extent));
2531 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2533 fieinfo->fi_flags = fiemap->fm_flags;
2534 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2535 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2536 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2538 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL of @inode.
 * The lli_lock guards lli_posix_acl against concurrent replacement.
 */
2543 struct posix_acl * ll_get_acl(struct inode *inode, int type)
2545 struct ll_inode_info *lli = ll_i2info(inode);
2546 struct posix_acl *acl = NULL;
2549 spin_lock(&lli->lli_lock);
2550 /* VFS' acl_permission_check->check_acl will release the refcount */
2551 acl = posix_acl_dup(lli->lli_posix_acl);
2552 spin_unlock(&lli->lli_lock);
/*
 * check_acl callback for generic_permission() on kernels that take one
 * (the prototypes vary by kernel API). Evaluates the cached POSIX ACL
 * against @mask; compiled out to a stub without CONFIG_FS_POSIX_ACL.
 */
2557 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
2559 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2560 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
2562 ll_check_acl(struct inode *inode, int mask)
2565 # ifdef CONFIG_FS_POSIX_ACL
2566 struct posix_acl *acl;
/* RCU-walk mode cannot block on ll_get_acl's spinlock path. */
2570 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2571 if (flags & IPERM_FLAG_RCU)
2574 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
2579 rc = posix_acl_permission(inode, acl, mask);
2580 posix_acl_release(acl);
2583 # else /* !CONFIG_FS_POSIX_ACL */
2585 # endif /* CONFIG_FS_POSIX_ACL */
2587 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission inode operation (prototype varies by kernel API).
 * Refuses RCU-walk mode, revalidates the root inode on first use,
 * delegates remote-client checks to lustre_check_remote_perm(), and
 * otherwise defers to generic_permission() with ll_check_acl.
 */
2589 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2590 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2592 # ifdef HAVE_INODE_PERMISION_2ARGS
2593 int ll_inode_permission(struct inode *inode, int mask)
2595 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU path walk: we may block below, so ask VFS to retry in ref-walk. */
2602 #ifdef MAY_NOT_BLOCK
2603 if (mask & MAY_NOT_BLOCK)
2605 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
2606 if (flags & IPERM_FLAG_RCU)
2610 /* as root inode are NOT getting validated in lookup operation,
2611 * need to do it before permission check. */
2613 if (inode == inode->i_sb->s_root->d_inode) {
2614 struct lookup_intent it = { .it_op = IT_LOOKUP };
2616 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2617 MDS_INODELOCK_LOOKUP);
2622 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2623 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2625 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2626 return lustre_check_remote_perm(inode, mask);
2628 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2629 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
/*
 * Kernel-compatibility aliases for the vectored read/write entries of the
 * file_operations tables below: older kernels expose ->readv/->writev,
 * newer ones ->aio_read/->aio_write.  NOTE(review): the #else separating
 * the two branches (original line 2639) is elided from view.
 */
2634 #ifdef HAVE_FILE_READV
2635 #define READ_METHOD readv
2636 #define READ_FUNCTION ll_file_readv
2637 #define WRITE_METHOD writev
2638 #define WRITE_FUNCTION ll_file_writev
2640 #define READ_METHOD aio_read
2641 #define READ_FUNCTION ll_file_aio_read
2642 #define WRITE_METHOD aio_write
2643 #define WRITE_FUNCTION ll_file_aio_write
2646 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no ->flock/->lock entries, so flock falls back
 * to the VFS' local implementation.  NOTE(review): the #endif lines closing
 * the HAVE_KERNEL_SENDFILE/HAVE_KERNEL_SPLICE_READ sections and the
 * remaining members are elided (numbering jumps). */
2647 struct file_operations ll_file_operations = {
2648 .read = ll_file_read,
2649 .READ_METHOD = READ_FUNCTION,
2650 .write = ll_file_write,
2651 .WRITE_METHOD = WRITE_FUNCTION,
2652 .unlocked_ioctl = ll_file_ioctl,
2653 .open = ll_file_open,
2654 .release = ll_file_release,
2655 .mmap = ll_file_mmap,
2656 .llseek = ll_file_seek,
2657 #ifdef HAVE_KERNEL_SENDFILE
2658 .sendfile = ll_file_sendfile,
2660 #ifdef HAVE_KERNEL_SPLICE_READ
2661 .splice_read = ll_file_splice_read,
/* file_operations used with -o flock: identical to ll_file_operations but
 * wires ->flock and ->lock to ll_file_flock for cluster-coherent locking.
 * NOTE(review): closing #endif lines and trailing members are elided. */
2667 struct file_operations ll_file_operations_flock = {
2668 .read = ll_file_read,
2669 .READ_METHOD = READ_FUNCTION,
2670 .write = ll_file_write,
2671 .WRITE_METHOD = WRITE_FUNCTION,
2672 .unlocked_ioctl = ll_file_ioctl,
2673 .open = ll_file_open,
2674 .release = ll_file_release,
2675 .mmap = ll_file_mmap,
2676 .llseek = ll_file_seek,
2677 #ifdef HAVE_KERNEL_SENDFILE
2678 .sendfile = ll_file_sendfile,
2680 #ifdef HAVE_KERNEL_SPLICE_READ
2681 .splice_read = ll_file_splice_read,
2685 .flock = ll_file_flock,
2686 .lock = ll_file_flock
2689 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Same table again, but ->flock/->lock point at ll_file_noflock so every
 * flock/lock request fails instead of being handled locally or globally.
 * NOTE(review): closing #endif lines and trailing members are elided. */
2690 struct file_operations ll_file_operations_noflock = {
2691 .read = ll_file_read,
2692 .READ_METHOD = READ_FUNCTION,
2693 .write = ll_file_write,
2694 .WRITE_METHOD = WRITE_FUNCTION,
2695 .unlocked_ioctl = ll_file_ioctl,
2696 .open = ll_file_open,
2697 .release = ll_file_release,
2698 .mmap = ll_file_mmap,
2699 .llseek = ll_file_seek,
2700 #ifdef HAVE_KERNEL_SENDFILE
2701 .sendfile = ll_file_sendfile,
2703 #ifdef HAVE_KERNEL_SPLICE_READ
2704 .splice_read = ll_file_splice_read,
2708 .flock = ll_file_noflock,
2709 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, xattr, permission, and
 * (where the kernel supports them) fiemap/get_acl entry points.
 * NOTE(review): closing #endif lines and the table's closing brace are
 * elided (numbering jumps). */
2712 struct inode_operations ll_file_inode_operations = {
2713 .setattr = ll_setattr,
2714 .getattr = ll_getattr,
2715 .permission = ll_inode_permission,
2716 .setxattr = ll_setxattr,
2717 .getxattr = ll_getxattr,
2718 .listxattr = ll_listxattr,
2719 .removexattr = ll_removexattr,
2720 #ifdef HAVE_LINUX_FIEMAP_H
2721 .fiemap = ll_fiemap,
2723 #ifdef HAVE_IOP_GET_ACL
2724 .get_acl = ll_get_acl,
2728 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers.  ioc_sem is a
 * reader/writer semaphore: dispatch (ll_iocontrol_call) takes it for read,
 * (un)registration for write.  NOTE(review): the struct's closing brace /
 * initializer framing is partially elided. */
2729 static struct llioc_ctl_data {
2730 struct rw_semaphore ioc_sem;
2731 cfs_list_t ioc_head;
2733 __RWSEM_INITIALIZER(llioc.ioc_sem),
2734 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl commands it accepts.
 * iocd_cmd[0] is a pre-C99 flexible array — iocd_count entries are
 * allocated past the struct (see ll_iocontrol_register). */
2739 cfs_list_t iocd_list;
2740 unsigned int iocd_size;
2741 llioc_callback_t iocd_cb;
2742 unsigned int iocd_count;
2743 unsigned int iocd_cmd[0];
/*
 * ll_iocontrol_register(): register a dynamic ioctl handler.
 * Allocates an llioc_data carrying the callback plus a copy of the
 * command-number array, and appends it to the global list under the
 * registry's write lock.
 *
 * \param cb     callback invoked for matching commands (must be non-NULL)
 * \param count  number of entries in \a cmd (0 <= count <= LLIOC_MAX_CMD)
 * \param cmd    array of ioctl command numbers handled by \a cb
 * \retval opaque handle to pass to ll_iocontrol_unregister() —
 *         presumably the in_data pointer itself; the RETURN lines are
 *         elided from view (numbering jumps), verify in full source
 */
2746 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2749 struct llioc_data *in_data = NULL;
2752 if (cb == NULL || cmd == NULL ||
2753 count > LLIOC_MAX_CMD || count < 0)
/* allocation covers the struct plus the trailing iocd_cmd[] array */
2756 size = sizeof(*in_data) + count * sizeof(unsigned int);
2757 OBD_ALLOC(in_data, size);
2758 if (in_data == NULL)
2761 memset(in_data, 0, sizeof(*in_data));
2762 in_data->iocd_size = size;
2763 in_data->iocd_cb = cb;
2764 in_data->iocd_count = count;
2765 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2767 down_write(&llioc.ioc_sem);
2768 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2769 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): remove and free a handler previously
 * returned by ll_iocontrol_register().  Scans the registry under the
 * write lock; the match test against \a magic sits on an elided line
 * (original 2783) — presumably `if (tmp == magic)`, verify in full
 * source.  Warns if no matching registration is found.
 */
2774 void ll_iocontrol_unregister(void *magic)
2776 struct llioc_data *tmp;
2781 down_write(&llioc.ioc_sem);
2782 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* size is saved before the entry is freed below */
2784 unsigned int size = tmp->iocd_size;
2786 cfs_list_del(&tmp->iocd_list);
/* drop the lock before freeing; list membership already severed */
2787 up_write(&llioc.ioc_sem);
2789 OBD_FREE(tmp, size);
2793 up_write(&llioc.ioc_sem);
2795 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* export the dynamic-ioctl registration API to other kernel modules */
2798 EXPORT_SYMBOL(ll_iocontrol_register);
2799 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch an ioctl to the dynamically registered
 * handlers.  Walks every registration under the read lock; a handler whose
 * command table contains \a cmd is invoked, and iteration stops once a
 * callback returns LLIOC_STOP.
 *
 * \param rcp  out: handler's result code (left at -EINVAL if no handler
 *             ran) — the line storing rc into *rcp is elided from view
 * \retval LLIOC_CONT if no handler claimed the command, LLIOC_STOP
 *         otherwise (final RETURN elided, verify in full source)
 */
2801 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2802 unsigned int cmd, unsigned long arg, int *rcp)
2804 enum llioc_iter ret = LLIOC_CONT;
2805 struct llioc_data *data;
2806 int rc = -EINVAL, i;
2808 down_read(&llioc.ioc_sem);
2809 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2810 for (i = 0; i < data->iocd_count; i++) {
2811 if (cmd != data->iocd_cmd[i])
2814 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2818 if (ret == LLIOC_STOP)
2821 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf(): push a layout configuration down to the inode's
 * cl_object via cl_conf_set(), inside a nested client environment.
 * A NULL lli_clob (no cl_object attached yet) is treated as a no-op —
 * the early-return value is on an elided line.
 *
 * \param conf  layout configuration to apply (e.g. built from unpacked
 *              LOV metadata in ll_layout_refresh())
 * \retval result of cl_conf_set(), or PTR_ERR if the env can't be obtained
 */
2828 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2830 struct ll_inode_info *lli = ll_i2info(inode);
2831 struct cl_env_nest nest;
2836 if (lli->lli_clob == NULL)
2839 env = cl_env_nested_get(&nest);
2841 RETURN(PTR_ERR(env));
2843 result = cl_conf_set(env, lli->lli_clob, conf);
2844 cl_env_nested_put(&nest, env);
2849 * This function checks if there exists a LAYOUT lock on the client side,
2850 * or enqueues it if it doesn't have one in cache.
2852 * This function will not hold layout lock so it may be revoked any time after
2853 * this function returns. Any operation that depends on the layout should be redone
2856 * This function should be called before lov_io_init() to get an uptodate
2857 * layout version, the caller should save the version number and after IO
2858 * is finished, this function should be called again to verify that layout
2859 * is not changed during IO time.
/*
 * ll_layout_refresh(): ensure the client holds an up-to-date view of the
 * file's layout and report its generation in \a *gen.
 *
 * Fast path: match an already-cached MDS_INODELOCK_LAYOUT ibits lock.
 * Slow path: under lli_layout_mutex (to serialize enqueuers), re-check the
 * cache, then enqueue an IT_LAYOUT intent lock; the layout (lmm) arrives
 * in the lock's LVB, is unpacked into an lsm, and is applied to the
 * cl_object via ll_layout_conf().
 *
 * NOTE(review): many lines are elided throughout (numbering jumps):
 * RETURN statements, ldlm mode arguments, error branches, and closing
 * braces — the control flow shown here is incomplete.
 */
2861 int ll_layout_refresh(struct inode *inode, __u32 *gen)
2863 struct ll_inode_info *lli = ll_i2info(inode);
2864 struct ll_sb_info *sbi = ll_i2sbi(inode);
2865 struct md_op_data *op_data = NULL;
2866 struct lookup_intent it = { .it_op = IT_LAYOUT };
2867 struct lustre_handle lockh = { 0 };
2869 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
2871 .ei_cb_bl = ll_md_blocking_ast,
2872 .ei_cb_cp = ldlm_completion_ast,
2873 .ei_cbdata = inode };
/* layout locks are optional; nothing to do if the mount disabled them */
2878 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
2882 LASSERT(fid_is_sane(ll_inode2fid(inode)));
2883 LASSERT(S_ISREG(inode->i_mode));
2885 /* mostly layout lock is caching on the local side, so try to match
2886 * it before grabbing layout lock mutex. */
2887 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh,
2889 if (mode != 0) { /* hit cached lock */
2890 /* lsm_layout_gen is started from 0, plus 1 here to distinguish
2891 * the cases of no layout and first layout. */
2892 *gen = lli->lli_layout_gen + 1;
2894 ldlm_lock_decref(&lockh, mode);
2898 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
2899 0, 0, LUSTRE_OPC_ANY, NULL);
2900 if (IS_ERR(op_data))
2901 RETURN(PTR_ERR(op_data));
2903 /* take layout lock mutex to enqueue layout lock exclusively. */
2904 mutex_lock(&lli->lli_layout_mutex);
2906 /* try again inside layout mutex */
2907 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh,
2909 if (mode != 0) { /* hit cached lock */
2910 *gen = lli->lli_layout_gen + 1;
2912 ldlm_lock_decref(&lockh, mode);
2913 mutex_unlock(&lli->lli_layout_mutex);
2914 ll_finish_md_op_data(op_data);
2918 /* have to enqueue one */
2919 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent request is no longer needed once the lock is granted */
2921 if (it.d.lustre.it_data != NULL)
2922 ptlrpc_req_finished(it.d.lustre.it_data);
2923 it.d.lustre.it_data = NULL;
2926 struct ldlm_lock *lock;
2927 struct cl_object_conf conf;
2928 struct lustre_md md = { NULL };
2932 LASSERT(lustre_handle_is_used(&lockh));
2934 /* set lock data in case this is a new lock */
2935 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
2937 lock = ldlm_handle2lock(&lockh);
2938 LASSERT(lock != NULL);
2940 /* for IT_LAYOUT lock, lmm is returned in lock's lvb
2941 * data via completion callback */
2942 lmm = lock->l_lvb_data;
2943 lmmsize = lock->l_lvb_len;
2945 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
/* +1 for the same no-layout/first-layout disambiguation as above */
2949 *gen = md.lsm->lsm_layout_gen + 1;
2952 CERROR("file: "DFID" unpackmd error: %d\n",
2953 PFID(&lli->lli_fid), rc);
2956 LDLM_LOCK_PUT(lock);
2958 /* set layout to file. This may cause lock expiration as we
2959 * set layout inside layout ibits lock. */
2960 memset(&conf, 0, sizeof conf);
2961 conf.coc_inode = inode;
2962 conf.u.coc_md = &md;
2963 ll_layout_conf(inode, &conf);
2965 lli->lli_has_smd = md.lsm != NULL;
/* lsm was allocated by obd_unpackmd(); release it after conf is applied */
2967 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
2969 ll_intent_drop_lock(&it);
2971 mutex_unlock(&lli->lli_layout_mutex);
2972 ll_finish_md_op_data(op_data);