4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab cache,
 * using an allocation mode that permits I/O (CFS_ALLOC_IO).
 * NOTE(review): interior lines are elided in this listing; presumably the
 * allocated @fd (possibly NULL on failure) is returned — confirm in full source.
 */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). */
61 static void ll_file_data_put(struct ll_file_data *fd)
64 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (mode, a/m/ctime, size, blocks, flags),
 * its FID, the current IO epoch, the open handle @fh and an MDS capability
 * into @op_data, for sending to the MDS (e.g. at close time).
 * The capability from ll_mdscapa_get() is reference-counted; the caller of
 * the eventual MDS operation is responsible for releasing it.
 */
67 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
68 struct lustre_handle *fh)
70 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
71 op_data->op_attr.ia_mode = inode->i_mode;
72 op_data->op_attr.ia_atime = inode->i_atime;
73 op_data->op_attr.ia_mtime = inode->i_mtime;
74 op_data->op_attr.ia_ctime = inode->i_ctime;
75 op_data->op_attr.ia_size = i_size_read(inode);
76 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the lustre-extended iattr; convert kernel inode
 * flags to the on-wire ext flag representation. */
77 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
78 ll_inode_to_ext_flags(inode->i_flags);
79 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
81 op_data->op_handle = *fh;
82 op_data->op_capa1 = ll_mdscapa_get(inode);
86 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * this close request, marking mode and timestamps valid unconditionally.
 * Size/blocks are only reported here when Size-on-MDS is not in use (or the
 * file is not a regular file); otherwise they follow via a later SOM update.
 * NOTE(review): &och is a pointer-to-pointer — ll_ioepoch_close() may clear
 * the caller's handle; interior lines are elided in this listing.
 */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
95 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Read-only opens cannot have dirtied size/blocks on the OSTs. */
97 if (!(och->och_flags & FMODE_WRITE))
100 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
101 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
103 ll_ioepoch_close(inode, op_data, &och, 0);
106 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
107 ll_prep_md_op_data(op_data, inode, NULL, NULL,
108 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for the open handle @och on @inode via @md_exp.
 * If the MDS requests it, gather Size-on-MDS attributes from the OSTs and
 * send a follow-up setattr (ll_som_update()).  Also destroys orphan OST
 * objects named in the close reply and clears the open-replay data.
 * NOTE(review): error paths and RETURN lines are elided in this listing.
 */
112 static int ll_close_inode_openhandle(struct obd_export *md_exp,
114 struct obd_client_handle *och)
116 struct obd_export *exp = ll_i2mdexp(inode);
117 struct md_op_data *op_data;
118 struct ptlrpc_request *req = NULL;
119 struct obd_device *obd = class_exp2obd(exp);
126 * XXX: in case of LMV, is this correct to access
129 CERROR("Invalid MDC connection handle "LPX64"\n",
130 ll_i2mdexp(inode)->exp_handle.h_cookie);
134 OBD_ALLOC_PTR(op_data);
136 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138 ll_prepare_close(inode, op_data, och);
/* Remember whether this close ends the IO epoch before md_close may
 * consume op_data state. */
139 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
140 rc = md_close(md_exp, op_data, och->och_mod, &req);
142 /* This close must have the epoch closed. */
143 LASSERT(epoch_close);
144 /* MDS has instructed us to obtain Size-on-MDS attribute from
145 * OSTs and send setattr to back to MDS. */
146 rc = ll_som_update(inode, op_data);
148 CERROR("inode %lu mdc Size-on-MDS update failed: "
149 "rc = %d\n", inode->i_ino, rc);
153 CERROR("inode %lu mdc close failed: rc = %d\n",
156 ll_finish_md_op_data(op_data);
159 rc = ll_objects_destroy(req, inode);
161 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* If SOM is enabled and the epoch is still open on a written regular
 * file, queue DONE_WRITING so the epoch gets closed later. */
168 if (exp_connect_som(exp) && !epoch_close &&
169 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
170 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
172 md_clear_open_replay_data(md_exp, och);
173 /* Free @och if it is not waiting for DONE_WRITING. */
174 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177 if (req) /* This is close request */
178 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle of the flavor selected by @flags
 * (write / exec / read) if this caller holds the last reference to it.
 * If other users remain (per-flavor use count under lli_och_mutex), the
 * handle is kept.  Otherwise the handle is detached under the mutex and
 * closed outside it via ll_close_inode_openhandle().
 */
182 int ll_md_real_close(struct inode *inode, int flags)
184 struct ll_inode_info *lli = ll_i2info(inode);
185 struct obd_client_handle **och_p;
186 struct obd_client_handle *och;
/* Select the open-handle slot and its use count by open mode. */
191 if (flags & FMODE_WRITE) {
192 och_p = &lli->lli_mds_write_och;
193 och_usecount = &lli->lli_open_fd_write_count;
194 } else if (flags & FMODE_EXEC) {
195 och_p = &lli->lli_mds_exec_och;
196 och_usecount = &lli->lli_open_fd_exec_count;
198 LASSERT(flags & FMODE_READ);
199 och_p = &lli->lli_mds_read_och;
200 och_usecount = &lli->lli_open_fd_read_count;
203 cfs_mutex_lock(&lli->lli_och_mutex);
204 if (*och_usecount) { /* There are still users of this handle, so
206 cfs_mutex_unlock(&lli->lli_och_mutex);
211 cfs_mutex_unlock(&lli->lli_och_mutex);
213 if (och) { /* There might be a race and somebody have freed this och
215 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open counts, and — unless a cached OPEN DLM lock lets us skip talking to
 * the MDS — call ll_md_real_close() to release the MDS open handle.
 * Finally detach and free the file's ll_file_data and close its capability.
 */
222 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
226 struct ll_inode_info *lli = ll_i2info(inode);
230 /* clear group lock, if present */
231 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
232 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
234 /* Let's see if we have good enough OPEN lock on the file and if
235 we can skip talking to MDS */
236 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a granted lock, don't take a reference. */
238 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
239 struct lustre_handle lockh;
240 struct inode *inode = file->f_dentry->d_inode;
241 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
243 cfs_mutex_lock(&lli->lli_och_mutex);
244 if (fd->fd_omode & FMODE_WRITE) {
246 LASSERT(lli->lli_open_fd_write_count);
247 lli->lli_open_fd_write_count--;
248 } else if (fd->fd_omode & FMODE_EXEC) {
250 LASSERT(lli->lli_open_fd_exec_count);
251 lli->lli_open_fd_exec_count--;
254 LASSERT(lli->lli_open_fd_read_count);
255 lli->lli_open_fd_read_count--;
257 cfs_mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: must do the real MDS close. */
259 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
260 LDLM_IBITS, &policy, lockmode,
262 rc = ll_md_real_close(file->f_dentry->d_inode,
266 CERROR("Releasing a file %p with negative dentry %p. Name %s",
267 file, file->f_dentry, file->f_dentry->d_name.name);
270 LUSTRE_FPRIVATE(file) = NULL;
271 ll_file_data_put(fd);
272 ll_capa_close(inode);
277 /* While this returns an error code, fput() the caller does not, so we need
278 * to make every effort to clean up all of our state here. Also, applications
279 * rarely check close errors and even if an error is returned they will not
280 * re-try the close call.
/*
 * VFS ->release() hook.  Cleans up remote-ACL state for the root inode,
 * stops a statahead thread this fd started (dirs), clears pending async
 * write errors on regular files, and finally closes the MDS handle via
 * ll_md_close().  The root dentry is special-cased: only the file data is
 * detached, no MDS close.
 */
282 int ll_file_release(struct inode *inode, struct file *file)
284 struct ll_file_data *fd;
285 struct ll_sb_info *sbi = ll_i2sbi(inode);
286 struct ll_inode_info *lli = ll_i2info(inode);
290 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
291 inode->i_generation, inode);
293 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the filesystem root. */
294 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
295 inode == inode->i_sb->s_root->d_inode) {
296 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
299 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
300 fd->fd_flags &= ~LL_FILE_RMTACL;
301 rct_del(&sbi->ll_rct, cfs_curproc_pid());
302 et_search_free(&sbi->ll_et, cfs_curproc_pid());
307 if (inode->i_sb->s_root != file->f_dentry)
308 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
309 fd = LUSTRE_FPRIVATE(file);
312 /* The last ref on @file, maybe not the the owner pid of statahead.
313 * Different processes can open the same dir, "ll_opendir_key" means:
314 * it is me that should stop the statahead thread. */
315 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
316 lli->lli_opendir_pid != 0)
317 ll_stop_statahead(inode, lli->lli_opendir_key)	;
319 if (inode->i_sb->s_root == file->f_dentry) {
320 LUSTRE_FPRIVATE(file) = NULL;
321 ll_file_data_put(fd);
/* Pick up any asynchronous write error so close() can report it. */
325 if (!S_ISDIR(inode->i_mode)) {
326 lov_read_and_clear_async_rc(lli->lli_clob);
327 lli->lli_async_rc = 0;
330 rc = ll_md_close(sbi->ll_md_exp, inode, file);
332 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
333 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when called from setstripe.
 * On success, updates the inode from the reply and records the returned
 * DLM lock.  The request reference is kept in itp->d.lustre.it_data for the
 * caller; errors drop it via the out path.
 */
338 static int ll_intent_file_open(struct file *file, void *lmm,
339 int lmmsize, struct lookup_intent *itp)
341 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
342 struct dentry *parent = file->f_dentry->d_parent;
343 const char *name = file->f_dentry->d_name.name;
344 const int len = file->f_dentry->d_name.len;
345 struct md_op_data *op_data;
346 struct ptlrpc_request *req;
347 __u32 opc = LUSTRE_OPC_ANY;
354 /* Usually we come here only for NFSD, and we want open lock.
355 But we can also get here with pre 2.6.15 patchless kernels, and in
356 that case that lock is also ok */
357 /* We can also get here if there was cached open handle in revalidate_it
358 * but it disappeared while we were getting from there to ll_file_open.
359 * But this means this file was closed and immediately opened which
360 * makes a good candidate for using OPEN lock */
361 /* If lmmsize & lmm are not 0, we are just setting stripe info
362 * parameters. No need for the open lock */
363 if (lmm == NULL && lmmsize == 0) {
364 itp->it_flags |= MDS_OPEN_LOCK;
365 if (itp->it_flags & FMODE_WRITE)
366 opc = LUSTRE_OPC_CREATE;
369 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
370 file->f_dentry->d_inode, name, len,
373 RETURN(PTR_ERR(op_data));
375 itp->it_flags |= MDS_OPEN_BY_FID;
376 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
377 0 /*unused */, &req, ll_md_blocking_ast, 0);
378 ll_finish_md_op_data(op_data);
380 /* reason for keep own exit path - don't flood log
381 * with messages with -ESTALE errors.
383 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
384 it_open_error(DISP_OPEN_OPEN, itp))
386 ll_release_openhandle(file->f_dentry, itp);
390 if (it_disposition(itp, DISP_LOOKUP_NEG))
391 GOTO(out, rc = -ENOENT);
393 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
394 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
395 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the open reply; attach the returned lock. */
399 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
400 if (!rc && itp->d.lustre.it_lock_mode)
401 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
405 ptlrpc_req_finished(itp->d.lustre.it_data);
406 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
407 ll_intent_drop_lock(itp);
413 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
414 * not believe attributes if a few ioepoch holders exist. Attributes for
415 * previous ioepoch if new one is opened are also skipped by MDS.
/* Record @ioepoch on the inode only when it is non-zero and differs from
 * the currently recorded epoch. */
417 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
419 if (ioepoch && lli->lli_ioepoch != ioepoch) {
420 lli->lli_ioepoch = ioepoch;
421 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
422 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDS open reply carried by @it:
 * copy the server file handle, mark the handle magic, record FID and open
 * flags, publish the reply's ioepoch on the inode, and register the request
 * for open replay.  Returns md_set_open_replay_data()'s result.
 */
426 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
427 struct lookup_intent *it, struct obd_client_handle *och)
429 struct ptlrpc_request *req = it->d.lustre.it_data;
430 struct mdt_body *body;
434 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
435 LASSERT(body != NULL); /* reply already checked out */
437 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
438 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
439 och->och_fid = lli->lli_fid;
440 och->och_flags = it->it_flags;
441 ll_ioepoch_open(lli, body->ioepoch);
443 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-local part of an open: if @och is given, fill it from
 * the intent reply via ll_och_fill(); then attach @fd as the file's private
 * data, initialize readahead state, and record the open mode.
 * Asserts the file has no private data yet.
 */
446 int ll_local_open(struct file *file, struct lookup_intent *it,
447 struct ll_file_data *fd, struct obd_client_handle *och)
449 struct inode *inode = file->f_dentry->d_inode;
450 struct ll_inode_info *lli = ll_i2info(inode);
453 LASSERT(!LUSTRE_FPRIVATE(file));
458 struct ptlrpc_request *req = it->d.lustre.it_data;
459 struct mdt_body *body;
462 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
466 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
/* Debug trace only: a write open whose reply already carries a size. */
467 if ((it->it_flags & FMODE_WRITE) &&
468 (body->valid & OBD_MD_FLSIZE))
469 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
470 lli->lli_ioepoch, PFID(&lli->lli_fid));
473 LUSTRE_FPRIVATE(file) = fd;
474 ll_readahead_init(inode, &fd->fd_ras);
475 fd->fd_omode = it->it_flags;
479 /* Open a file, and (for the very first open) create objects on the OSTs at
480 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
481 * creation or open until ll_lov_setstripe() ioctl is called.
483 * If we already have the stripe MD locally then we don't request it in
484 * md_open(), by passing a lmm_size = 0.
486 * It is up to the application to ensure no other processes open this file
487 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
488 * used. We might be able to avoid races of that sort by getting lli_open_sem
489 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
490 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() hook.  Either reuses an existing per-mode MDS open handle
 * (under lli_och_mutex) or performs a new intent open via
 * ll_intent_file_open(), then completes the client side with
 * ll_local_open().  Directories may additionally register this fd as the
 * statahead "owner".  Error paths unwind the och allocation, statahead
 * registration and file data.
 */
492 int ll_file_open(struct inode *inode, struct file *file)
494 struct ll_inode_info *lli = ll_i2info(inode);
495 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
496 .it_flags = file->f_flags };
497 struct obd_client_handle **och_p = NULL;
498 __u64 *och_usecount = NULL;
499 struct ll_file_data *fd;
500 int rc = 0, opendir_set = 0;
503 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
504 inode->i_generation, inode, file->f_flags);
506 it = file->private_data; /* XXX: compat macro */
507 file->private_data = NULL; /* prevent ll_local_open assertion */
509 fd = ll_file_data_get();
511 GOTO(out_och_free, rc = -ENOMEM);
/* First opener of a directory claims statahead ownership via @fd. */
514 if (S_ISDIR(inode->i_mode)) {
515 cfs_spin_lock(&lli->lli_sa_lock);
516 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
517 lli->lli_opendir_pid == 0) {
518 lli->lli_opendir_key = fd;
519 lli->lli_opendir_pid = cfs_curproc_pid();
522 cfs_spin_unlock(&lli->lli_sa_lock);
/* Root dentry: attach file data only, no MDS open needed. */
525 if (inode->i_sb->s_root == file->f_dentry) {
526 LUSTRE_FPRIVATE(file) = fd;
530 if (!it || !it->d.lustre.it_disposition) {
531 /* Convert f_flags into access mode. We cannot use file->f_mode,
532 * because everything but O_ACCMODE mask was stripped from
534 if ((oit.it_flags + 1) & O_ACCMODE)
536 if (file->f_flags & O_TRUNC)
537 oit.it_flags |= FMODE_WRITE;
539 /* kernel only call f_op->open in dentry_open. filp_open calls
540 * dentry_open after call to open_namei that checks permissions.
541 * Only nfsd_open call dentry_open directly without checking
542 * permissions and because of that this code below is safe. */
543 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
544 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
546 /* We do not want O_EXCL here, presumably we opened the file
547 * already? XXX - NFS implications? */
548 oit.it_flags &= ~O_EXCL;
550 /* bug20584, if "it_flags" contains O_CREAT, the file will be
551 * created if necessary, then "IT_CREAT" should be set to keep
552 * consistent with it */
553 if (oit.it_flags & O_CREAT)
554 oit.it_op |= IT_CREAT;
560 /* Let's see if we have file open on MDS already. */
561 if (it->it_flags & FMODE_WRITE) {
562 och_p = &lli->lli_mds_write_och;
563 och_usecount = &lli->lli_open_fd_write_count;
564 } else if (it->it_flags & FMODE_EXEC) {
565 och_p = &lli->lli_mds_exec_och;
566 och_usecount = &lli->lli_open_fd_exec_count;
568 och_p = &lli->lli_mds_read_och;
569 och_usecount = &lli->lli_open_fd_read_count;
572 cfs_mutex_lock(&lli->lli_och_mutex);
573 if (*och_p) { /* Open handle is present */
574 if (it_disposition(it, DISP_OPEN_OPEN)) {
575 /* Well, there's extra open request that we do not need,
576 let's close it somehow. This will decref request. */
577 rc = it_open_error(DISP_OPEN_OPEN, it);
579 cfs_mutex_unlock(&lli->lli_och_mutex);
580 GOTO(out_openerr, rc);
583 ll_release_openhandle(file->f_dentry, it);
587 rc = ll_local_open(file, it, fd, NULL);
590 cfs_mutex_unlock(&lli->lli_och_mutex);
591 GOTO(out_openerr, rc);
594 LASSERT(*och_usecount == 0);
595 if (!it->d.lustre.it_disposition) {
596 /* We cannot just request lock handle now, new ELC code
597 means that one of other OPEN locks for this file
598 could be cancelled, and since blocking ast handler
599 would attempt to grab och_mutex as well, that would
600 result in a deadlock */
601 cfs_mutex_unlock(&lli->lli_och_mutex);
602 it->it_create_mode |= M_CHECK_STALE;
603 rc = ll_intent_file_open(file, NULL, 0, it);
604 it->it_create_mode &= ~M_CHECK_STALE;
606 GOTO(out_openerr, rc);
610 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
612 GOTO(out_och_free, rc = -ENOMEM);
616 /* md_intent_lock() didn't get a request ref if there was an
617 * open error, so don't do cleanup on the request here
619 /* XXX (green): Should not we bail out on any error here, not
620 * just open error? */
621 rc = it_open_error(DISP_OPEN_OPEN, it);
623 GOTO(out_och_free, rc);
625 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
627 rc = ll_local_open(file, it, fd, *och_p);
629 GOTO(out_och_free, rc);
631 cfs_mutex_unlock(&lli->lli_och_mutex);
634 /* Must do this outside lli_och_mutex lock to prevent deadlock where
635 different kind of OPEN lock for this same inode gets cancelled
636 by ldlm_cancel_lru */
637 if (!S_ISREG(inode->i_mode))
638 GOTO(out_och_free, rc);
/* No striping yet: either object creation was delayed (O_LOV_DELAY_CREATE
 * or read-only open) or the delay flag is cleared before falling through. */
642 if (!lli->lli_has_smd) {
643 if (file->f_flags & O_LOV_DELAY_CREATE ||
644 !(file->f_mode & FMODE_WRITE)) {
645 CDEBUG(D_INODE, "object creation was delayed\n");
646 GOTO(out_och_free, rc);
649 file->f_flags &= ~O_LOV_DELAY_CREATE;
650 GOTO(out_och_free, rc);
653 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
654 ptlrpc_req_finished(it->d.lustre.it_data);
655 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
659 if (och_p && *och_p) {
660 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
661 *och_p = NULL; /* OBD_FREE writes some magic there */
664 cfs_mutex_unlock(&lli->lli_och_mutex);
667 if (opendir_set != 0)
668 ll_stop_statahead(inode, lli->lli_opendir_key);
670 ll_file_data_put(fd);
672 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
678 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an asynchronous OST getattr for the objects described by @lsm and
 * wait for completion.  @ioepoch is packed into the request; if @sync is
 * set, OBD_FL_SRVLOCK asks the OSTs to take the lock server-side.  On
 * return the obdo's o_valid is masked down to the fields this path is
 * allowed to report (blocks, blksize, times, size, data version).
 */
679 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
680 struct obd_capa *capa, struct obdo *obdo,
681 __u64 ioepoch, int sync)
683 struct ptlrpc_request_set *set;
684 struct obd_info oinfo = { { { 0 } } };
689 LASSERT(lsm != NULL);
693 oinfo.oi_oa->o_id = lsm->lsm_object_id;
694 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
695 oinfo.oi_oa->o_mode = S_IFREG;
696 oinfo.oi_oa->o_ioepoch = ioepoch;
697 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
698 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
699 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
700 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
701 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
702 OBD_MD_FLDATAVERSION;
703 oinfo.oi_capa = capa;
705 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
706 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
709 set = ptlrpc_prep_set();
711 CERROR("can't allocate ptlrpc set\n");
714 rc = obd_getattr_async(exp, &oinfo, set);
716 rc = ptlrpc_set_wait(set);
717 ptlrpc_set_destroy(set);
720 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
721 OBD_MD_FLATIME | OBD_MD_FLMTIME |
722 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
723 OBD_MD_FLDATAVERSION);
728 * Performs the getattr on the inode and updates its fields.
729 * If @sync != 0, perform the getattr under the server-side lock.
/* Wrapper around ll_lsm_getattr(): fetch the inode's LSM and MDS capa,
 * do the OST getattr, refresh the kernel inode from the obdo and release
 * the LSM reference. */
731 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
732 __u64 ioepoch, int sync)
734 struct obd_capa *capa = ll_mdscapa_get(inode);
735 struct lov_stripe_md *lsm;
739 lsm = ccc_inode_lsm_get(inode);
740 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
741 capa, obdo, ioepoch, sync);
744 obdo_refresh_inode(inode, obdo, obdo->o_valid);
746 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
747 lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
748 (unsigned long long)inode->i_blocks,
749 (unsigned long)ll_inode_blksize(inode));
751 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with size/blocks/times
 * obtained from the OSTs via obd_merge_lvb(), then write the result back
 * into the kernel inode under the inode size lock.
 */
755 int ll_merge_lvb(struct inode *inode)
757 struct ll_inode_info *lli = ll_i2info(inode);
758 struct ll_sb_info *sbi = ll_i2sbi(inode);
759 struct lov_stripe_md *lsm;
765 lsm = ccc_inode_lsm_get(inode);
766 ll_inode_size_lock(inode);
767 inode_init_lvb(inode, &lvb);
769 /* merge timestamps the most recently obtained from mds with
770 timestamps obtained from osts */
771 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
772 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
773 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
775 rc = obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
776 cl_isize_write_nolock(inode, lvb.lvb_size);
778 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
779 PFID(&lli->lli_fid), lvb.lvb_size);
780 inode->i_blocks = lvb.lvb_blocks;
782 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
783 LTIME_S(inode->i_atime) = lvb.lvb_atime;
784 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
786 ll_inode_size_unlock(inode);
787 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl helper: getattr the OST objects of @lsm (no capa, epoch 0, no
 * server-side lock) and copy size/blocks/times from the obdo into the
 * caller's stat structure.
 */
792 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
795 struct obdo obdo = { 0 };
798 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
800 st->st_size = obdo.o_size;
801 st->st_blocks = obdo.o_blocks;
802 st->st_mtime = obdo.o_mtime;
803 st->st_atime = obdo.o_atime;
804 st->st_ctime = obdo.o_ctime;
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC flags and choose the lock
 * requirement — never lock (nolock files, with no_srvlock set), mandatory
 * lock (append writes), or "maybe" otherwise.
 */
809 void ll_io_init(struct cl_io *io, const struct file *file, int write)
811 struct inode *inode = file->f_dentry->d_inode;
813 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
815 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
816 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || IS_SYNC(inode);
818 io->ci_obj = ll_i2info(inode)->lli_clob;
819 io->ci_lockreq = CILR_MAYBE;
820 if (ll_file_nolock(file)) {
821 io->ci_lockreq = CILR_NEVER;
822 io->ci_no_srvlock = 1;
823 } else if (file->f_flags & O_APPEND) {
824 io->ci_lockreq = CILR_MANDATORY;
/*
 * Common engine for all read/write entry points (readv/aio/sendfile/splice).
 * Sets up the cl_io and the per-subtype vvp/ccc io state from @args, takes
 * lli_write_mutex for non-group-locked writes (or lli_trunc_sem for reads),
 * runs the client I/O loop, advances *ppos by the bytes transferred, and
 * tallies read/write byte statistics.  lli_write_rc records the last write
 * result so close() can report deferred write errors.
 */
828 static ssize_t ll_file_io_generic(const struct lu_env *env,
829 struct vvp_io_args *args, struct file *file,
830 enum cl_io_type iot, loff_t *ppos, size_t count)
832 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
837 io = ccc_env_thread_io(env);
838 ll_io_init(io, file, iot == CIT_WRITE);
840 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
841 struct vvp_io *vio = vvp_env_io(env);
842 struct ccc_io *cio = ccc_env_io(env);
843 int write_mutex_locked = 0;
845 cio->cui_fd = LUSTRE_FPRIVATE(file);
846 vio->cui_io_subtype = args->via_io_subtype;
848 switch (vio->cui_io_subtype) {
850 cio->cui_iov = args->u.normal.via_iov;
851 cio->cui_nrsegs = args->u.normal.via_nrsegs;
852 cio->cui_tot_nrsegs = cio->cui_nrsegs;
853 #ifndef HAVE_FILE_WRITEV
854 cio->cui_iocb = args->u.normal.via_iocb;
/* Serialize plain writes against each other; group-locked writes
 * are already serialized by the group lock. */
856 if ((iot == CIT_WRITE) &&
857 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
858 if (cfs_mutex_lock_interruptible(&lli->
860 GOTO(out, result = -ERESTARTSYS);
861 write_mutex_locked = 1;
862 } else if (iot == CIT_READ) {
863 cfs_down_read(&lli->lli_trunc_sem);
867 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
868 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
871 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
872 vio->u.splice.cui_flags = args->u.splice.via_flags;
875 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
878 result = cl_io_loop(env, io);
879 if (write_mutex_locked)
880 cfs_mutex_unlock(&lli->lli_write_mutex);
881 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
882 cfs_up_read(&lli->lli_trunc_sem);
884 /* cl_io_rw_init() handled IO */
885 result = io->ci_result;
888 if (io->ci_nob > 0) {
890 *ppos = io->u.ci_wr.wr.crw_pos;
896 if (iot == CIT_READ) {
898 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
899 LPROC_LL_READ_BYTES, result);
900 } else if (iot == CIT_WRITE) {
902 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
903 LPROC_LL_WRITE_BYTES, result);
904 lli->lli_write_rc = 0;
906 lli->lli_write_rc = result;
915 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array: accumulate segment lengths into *count,
 * rejecting negative lengths / signed wraparound and truncating *nr_segs
 * at the first segment whose user memory fails access_ok().
 * NOTE(review): several lines are elided here; the full upstream version
 * returns -EINVAL/-EFAULT on the corresponding failures — confirm.
 */
917 static int ll_file_get_iov_count(const struct iovec *iov,
918 unsigned long *nr_segs, size_t *count)
923 for (seg = 0; seg < *nr_segs; seg++) {
924 const struct iovec *iv = &iov[seg];
927 * If any segment has a negative length, or the cumulative
928 * length ever wraps negative then return -EINVAL.
931 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
933 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
938 cnt -= iv->iov_len; /* This segment is no good */
945 #ifdef HAVE_FILE_READV
/* Vectored read entry point (kernels with file_operations->readv):
 * validate the iovec, grab a cl environment, fill IO_NORMAL args and
 * delegate to ll_file_io_generic() with CIT_READ. */
946 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
947 unsigned long nr_segs, loff_t *ppos)
950 struct vvp_io_args *args;
956 result = ll_file_get_iov_count(iov, &nr_segs, &count);
960 env = cl_env_get(&refcheck);
962 RETURN(PTR_ERR(env));
964 args = vvp_env_args(env, IO_NORMAL);
965 args->u.normal.via_iov = (struct iovec *)iov;
966 args->u.normal.via_nrsegs = nr_segs;
968 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
969 cl_env_put(env, &refcheck);
/* Scalar read (readv-capable kernels): wrap (buf, count) in a single
 * per-env iovec and forward to ll_file_readv(). */
973 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
977 struct iovec *local_iov;
982 env = cl_env_get(&refcheck);
984 RETURN(PTR_ERR(env));
986 local_iov = &vvp_env_info(env)->vti_local_iov;
987 local_iov->iov_base = (void __user *)buf;
988 local_iov->iov_len = count;
989 result = ll_file_readv(file, local_iov, 1, ppos);
990 cl_env_put(env, &refcheck);
/* AIO read entry point (->aio_read kernels): validate the iovec, fill
 * IO_NORMAL args including the kiocb, and run ll_file_io_generic() with
 * CIT_READ against iocb->ki_pos. */
995 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
996 unsigned long nr_segs, loff_t pos)
999 struct vvp_io_args *args;
1005 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1009 env = cl_env_get(&refcheck);
1011 RETURN(PTR_ERR(env));
1013 args = vvp_env_args(env, IO_NORMAL);
1014 args->u.normal.via_iov = (struct iovec *)iov;
1015 args->u.normal.via_nrsegs = nr_segs;
1016 args->u.normal.via_iocb = iocb;
1018 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1019 &iocb->ki_pos, count);
1020 cl_env_put(env, &refcheck);
/* Scalar read (AIO kernels): build a synchronous kiocb plus a one-entry
 * iovec from (buf, count, *ppos), call ll_file_aio_read() and copy the
 * advanced position back to *ppos. */
1024 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1028 struct iovec *local_iov;
1029 struct kiocb *kiocb;
1034 env = cl_env_get(&refcheck);
1036 RETURN(PTR_ERR(env));
1038 local_iov = &vvp_env_info(env)->vti_local_iov;
1039 kiocb = &vvp_env_info(env)->vti_kiocb;
1040 local_iov->iov_base = (void __user *)buf;
1041 local_iov->iov_len = count;
1042 init_sync_kiocb(kiocb, file);
1043 kiocb->ki_pos = *ppos;
1044 kiocb->ki_left = count;
1046 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1047 *ppos = kiocb->ki_pos;
1049 cl_env_put(env, &refcheck);
1055 * Write to a file (through the page cache).
1057 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (->writev kernels); mirrors ll_file_readv()
 * but runs ll_file_io_generic() with CIT_WRITE. */
1058 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1059 unsigned long nr_segs, loff_t *ppos)
1062 struct vvp_io_args *args;
1068 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1072 env = cl_env_get(&refcheck);
1074 RETURN(PTR_ERR(env));
1076 args = vvp_env_args(env, IO_NORMAL);
1077 args->u.normal.via_iov = (struct iovec *)iov;
1078 args->u.normal.via_nrsegs = nr_segs;
1080 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1081 cl_env_put(env, &refcheck);
/* Scalar write (writev-capable kernels): wrap (buf, count) in a single
 * per-env iovec and forward to ll_file_writev(). */
1085 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1089 struct iovec *local_iov;
1094 env = cl_env_get(&refcheck);
1096 RETURN(PTR_ERR(env));
1098 local_iov = &vvp_env_info(env)->vti_local_iov;
1099 local_iov->iov_base = (void __user *)buf;
1100 local_iov->iov_len = count;
1102 result = ll_file_writev(file, local_iov, 1, ppos);
1103 cl_env_put(env, &refcheck);
1107 #else /* AIO stuff */
/* AIO write entry point (->aio_write kernels); mirrors ll_file_aio_read()
 * but runs ll_file_io_generic() with CIT_WRITE against iocb->ki_pos. */
1108 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1109 unsigned long nr_segs, loff_t pos)
1112 struct vvp_io_args *args;
1118 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1122 env = cl_env_get(&refcheck);
1124 RETURN(PTR_ERR(env));
1126 args = vvp_env_args(env, IO_NORMAL);
1127 args->u.normal.via_iov = (struct iovec *)iov;
1128 args->u.normal.via_nrsegs = nr_segs;
1129 args->u.normal.via_iocb = iocb;
1131 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1132 &iocb->ki_pos, count);
1133 cl_env_put(env, &refcheck);
/* Scalar write (AIO kernels): build a synchronous kiocb plus a one-entry
 * iovec from (buf, count, *ppos), call ll_file_aio_write() and copy the
 * advanced position back to *ppos. */
1137 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1141 struct iovec *local_iov;
1142 struct kiocb *kiocb;
1147 env = cl_env_get(&refcheck);
1149 RETURN(PTR_ERR(env));
1151 local_iov = &vvp_env_info(env)->vti_local_iov;
1152 kiocb = &vvp_env_info(env)->vti_kiocb;
1153 local_iov->iov_base = (void __user *)buf;
1154 local_iov->iov_len = count;
1155 init_sync_kiocb(kiocb, file);
1156 kiocb->ki_pos = *ppos;
1157 kiocb->ki_left = count;
1159 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1160 *ppos = kiocb->ki_pos;
1162 cl_env_put(env, &refcheck);
1168 #ifdef HAVE_KERNEL_SENDFILE
1170 * Send file content (through pagecache) somewhere with helper
/* ->sendfile hook: package actor/target into IO_SENDFILE args and run the
 * generic read path. */
1172 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1173 read_actor_t actor, void *target)
1176 struct vvp_io_args *args;
1181 env = cl_env_get(&refcheck);
1183 RETURN(PTR_ERR(env));
1185 args = vvp_env_args(env, IO_SENDFILE);
1186 args->u.sendfile.via_target = target;
1187 args->u.sendfile.via_actor = actor;
1189 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1190 cl_env_put(env, &refcheck);
1195 #ifdef HAVE_KERNEL_SPLICE_READ
1197 * Send file content (through pagecache) somewhere with helper
/* ->splice_read hook: package pipe/flags into IO_SPLICE args and run the
 * generic read path. */
1199 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1200 struct pipe_inode_info *pipe, size_t count,
1204 struct vvp_io_args *args;
1209 env = cl_env_get(&refcheck);
1211 RETURN(PTR_ERR(env));
1213 args = vvp_env_args(env, IO_SPLICE);
1214 args->u.splice.via_pipe = pipe;
1215 args->u.splice.via_flags = flags;
1217 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1218 cl_env_put(env, &refcheck);
/*
 * Re-create a lost OST object for @inode: duplicate the inode's LSM into a
 * temporary copy, fill an obdo with (id, seq, ost_idx packed in o_nlink and
 * OBD_FL_RECREATE_OBJS) plus parent FID and timestamps, and ask the OSC/LOV
 * layer to re-create the object via obd_create() under the inode size lock.
 * Frees the LSM copy and releases the LSM reference on exit.
 */
1223 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1226 struct obd_export *exp = ll_i2dtexp(inode);
1227 struct obd_trans_info oti = { 0 };
1228 struct obdo *oa = NULL;
1231 struct lov_stripe_md *lsm = NULL, *lsm2;
1238 lsm = ccc_inode_lsm_get(inode);
1240 GOTO(out, rc = -ENOENT);
1242 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1243 (lsm->lsm_stripe_count));
1245 OBD_ALLOC_LARGE(lsm2, lsm_size);
1247 GOTO(out, rc = -ENOMEM);
/* o_nlink is reused to carry the target OST index for recreation. */
1251 oa->o_nlink = ost_idx;
1252 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1253 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1254 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1255 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1256 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1257 memcpy(lsm2, lsm, lsm_size);
1258 ll_inode_size_lock(inode);
1259 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1260 ll_inode_size_unlock(inode);
1262 OBD_FREE_LARGE(lsm2, lsm_size);
1265 ccc_inode_lsm_put(inode, lsm);
/*
 * ioctl handler (LL_IOC_RECREATE_OBJ): copy a ll_recreate_obj request from
 * user space and re-create the named object on the given OST index.
 * Requires CAP_SYS_ADMIN.
 */
1270 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1272 struct ll_recreate_obj ucreat;
1275 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1278 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1279 sizeof(struct ll_recreate_obj)))
1282 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1283 ucreat.lrc_ost_idx));
/*
 * ioctl handler (LL_IOC_RECREATE_FID): copy a lu_fid from user space and
 * derive the object id and OST index from its oid/seq fields (IDIF-style
 * packing: low 16 bits of seq shifted into id, next 16 bits are the OST
 * index), then re-create the object.  Requires CAP_SYS_ADMIN.
 */
1286 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1293 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1296 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1297 sizeof(struct lu_fid)))
1300 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1301 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1302 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/*
 * Apply user-supplied striping (@lum/@lum_size) to @inode by doing an
 * intent open that carries the stripe EA.  Fails early (leaving a debug
 * message) if the inode already has striping.  The open handle obtained
 * purely for setstripe is released again; the intent and LSM reference are
 * released on the way out, and the saved request is dropped on error.
 */
1305 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1306 int flags, struct lov_user_md *lum, int lum_size)
1308 struct lov_stripe_md *lsm = NULL;
1309 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1313 lsm = ccc_inode_lsm_get(inode);
1315 ccc_inode_lsm_put(inode, lsm);
1316 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1321 ll_inode_size_lock(inode);
1322 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1325 rc = oit.d.lustre.it_status;
1327 GOTO(out_req_free, rc);
1329 ll_release_openhandle(file->f_dentry, &oit);
1332 ll_inode_size_unlock(inode);
1333 ll_intent_release(&oit);
1334 ccc_inode_lsm_put(inode, lsm);
1337 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename relative to
 * directory @inode via an MDS getattr-by-name.  On success *lmmp points
 * into the reply buffer (caller keeps *request until done with *lmmp)
 * and *lmm_size is set.  Converts the EA to host endianness in place.
 */
1341 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1342 struct lov_mds_md **lmmp, int *lmm_size,
1343 struct ptlrpc_request **request)
1345 struct ll_sb_info *sbi = ll_i2sbi(inode);
1346 struct mdt_body *body;
1347 struct lov_mds_md *lmm = NULL;
1348 struct ptlrpc_request *req = NULL;
1349 struct md_op_data *op_data;
/* Size the getattr reply buffer for the largest possible EA */
1352 rc = ll_get_max_mdsize(sbi, &lmmsize);
1356 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1357 strlen(filename), lmmsize,
1358 LUSTRE_OPC_ANY, NULL);
1359 if (IS_ERR(op_data))
1360 RETURN(PTR_ERR(op_data));
1362 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1363 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1364 ll_finish_md_op_data(op_data);
1366 CDEBUG(D_INFO, "md_getattr_name failed "
1367 "on %s: rc %d\n", filename, rc);
1371 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1372 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1374 lmmsize = body->eadatasize;
/* No EA present (or zero-sized) => no striping to return */
1376 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1378 GOTO(out, rc = -ENODATA);
1381 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1382 LASSERT(lmm != NULL);
/* Only plain V1/V3 LOV EAs are understood here */
1384 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1385 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1386 GOTO(out, rc = -EPROTO);
1390 * This is coming from the MDS, so is probably in
1391 * little endian. We convert it to host endian before
1392 * passing it to userspace.
/* Swab only on big-endian hosts (cpu_to_le32 is a no-op on LE) */
1394 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1395 /* if function called for directory - we should
1396 * avoid swab not existent lsm objects */
1397 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1398 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1399 if (S_ISREG(body->mode))
1400 lustre_swab_lov_user_md_objects(
1401 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1402 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1403 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1404 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1405 if (S_ISREG(body->mode))
1406 lustre_swab_lov_user_md_objects(
1407 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1408 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1414 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: set a pre-existing-objects striping EA
 * (MDS_OPEN_HAS_OBJS) copied from userspace.  Requires CAP_SYS_ADMIN.
 * NOTE(review): lum_size covers only one lov_user_ost_data entry here;
 * presumably the ioctl contract fixes this — confirm against callers.
 */
1419 static int ll_lov_setea(struct inode *inode, struct file *file,
1422 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1423 struct lov_user_md *lump;
1424 int lum_size = sizeof(struct lov_user_md) +
1425 sizeof(struct lov_user_ost_data);
1429 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1432 OBD_ALLOC_LARGE(lump, lum_size);
1436 if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1437 OBD_FREE_LARGE(lump, lum_size);
1441 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1443 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a lov_user_md (V1, upgraded to V3
 * if the magic says so) from userspace and apply it via
 * ll_lov_setstripe_ea_info().  On success, reflect the actually-created
 * striping back to the user's buffer through LL_IOC_LOV_GETSTRIPE.
 */
1447 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1450 struct lov_user_md_v3 lumv3;
1451 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1452 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1453 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1456 int flags = FMODE_WRITE;
1459 /* first try with v1 which is smaller than v3 */
1460 lum_size = sizeof(struct lov_user_md_v1);
1461 if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
/* V3 magic => re-copy the larger V3 layout into the same buffer */
1464 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1465 lum_size = sizeof(struct lov_user_md_v3);
1466 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1470 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1472 struct lov_stripe_md *lsm;
/* Zero stripe_count first so old tools see "no objects" on error */
1473 put_user(0, &lumv1p->lmm_stripe_count);
1474 lsm = ccc_inode_lsm_get(inode);
1475 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1476 0, lsm, (void *)arg);
1477 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe md to the LOV
 * layer, which formats it into the userspace buffer at @arg.
 */
1482 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1484 struct lov_stripe_md *lsm;
1488 lsm = ccc_inode_lsm_get(inode);
1490 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1492 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: take a cl-layer group lock with gid @arg
 * and record it in the per-open-file data.  lli_lock protects the
 * fd_flags/fd_grouplock pair; the lock itself is acquired outside the
 * spinlock, so a second check afterwards handles the race.
 */
1496 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1498 struct ll_inode_info *lli = ll_i2info(inode);
1499 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1500 struct ccc_grouplock grouplock;
/* Group locks make no sense on nolock mounts */
1504 if (ll_file_nolock(file))
1505 RETURN(-EOPNOTSUPP);
1507 cfs_spin_lock(&lli->lli_lock);
/* At most one group lock per file descriptor */
1508 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1509 CWARN("group lock already existed with gid %lu\n",
1510 fd->fd_grouplock.cg_gid);
1511 cfs_spin_unlock(&lli->lli_lock);
1514 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1515 cfs_spin_unlock(&lli->lli_lock);
/* Acquire the actual cl lock; may block unless O_NONBLOCK */
1517 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1518 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1522 cfs_spin_lock(&lli->lli_lock);
/* Re-check: another thread may have installed a lock meanwhile */
1523 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1524 cfs_spin_unlock(&lli->lli_lock);
1525 CERROR("another thread just won the race\n");
1526 cl_put_grouplock(&grouplock);
1530 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1531 fd->fd_grouplock = grouplock;
1532 cfs_spin_unlock(&lli->lli_lock);
1534 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg
 * recorded in this file descriptor, if it matches.  State is cleared
 * under lli_lock; the cl lock is dropped after unlocking.
 */
1538 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1540 struct ll_inode_info *lli = ll_i2info(inode);
1541 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1542 struct ccc_grouplock grouplock;
1545 cfs_spin_lock(&lli->lli_lock);
1546 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1547 cfs_spin_unlock(&lli->lli_lock);
1548 CWARN("no group lock held\n");
1551 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* gid must match the one stored at lock time */
1553 if (fd->fd_grouplock.cg_gid != arg) {
1554 CWARN("group lock %lu doesn't match current id %lu\n",
1555 arg, fd->fd_grouplock.cg_gid);
1556 cfs_spin_unlock(&lli->lli_lock);
/* Detach the lock from the fd before releasing it */
1560 grouplock = fd->fd_grouplock;
1561 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1562 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1563 cfs_spin_unlock(&lli->lli_lock);
1565 cl_put_grouplock(&grouplock);
1566 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1571 * Close inode open handle
1573 * \param dentry [in] dentry which contains the inode
1574 * \param it [in,out] intent which contains open info and result
1577 * \retval <0 failure
/* \retval 0 on success (per the partial doc comment above) */
1579 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1581 struct inode *inode = dentry->d_inode;
1582 struct obd_client_handle *och;
1588 /* Root ? Do nothing. */
1589 if (dentry->d_inode->i_sb->s_root == dentry)
1592 /* No open handle to close? Move away */
1593 if (!it_disposition(it, DISP_OPEN_OPEN))
1596 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1598 OBD_ALLOC(och, sizeof(*och));
1600 GOTO(out, rc = -ENOMEM);
/* Build the client handle from the intent, then close it on the MDS */
1602 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1603 ll_i2info(inode), it, och);
1605 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1608 /* this one is in place of ll_file_open */
1609 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1610 ptlrpc_req_finished(it->d.lustre.it_data);
1611 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1617 * Get size for inode for which FIEMAP mapping is requested.
1618 * Make the FIEMAP get_info call and returns the result.
/* Shared by the ioctl path (ll_ioctl_fiemap) and the VFS ->fiemap path. */
1620 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1623 struct obd_export *exp = ll_i2dtexp(inode);
1624 struct lov_stripe_md *lsm = NULL;
1625 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1626 int vallen = num_bytes;
1630 /* Checks for fiemap flags */
1631 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Leave only the unsupported flags set so the caller can report them */
1632 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1636 /* Check for FIEMAP_FLAG_SYNC */
1637 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1638 rc = filemap_fdatawrite(inode->i_mapping);
1643 lsm = ccc_inode_lsm_get(inode);
1647 /* If the stripe_count > 1 and the application does not understand
1648 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1650 if (lsm->lsm_stripe_count > 1 &&
1651 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1652 GOTO(out, rc = -EOPNOTSUPP);
/* Identify the object for the OST-side get_info(KEY_FIEMAP) call */
1654 fm_key.oa.o_id = lsm->lsm_object_id;
1655 fm_key.oa.o_seq = lsm->lsm_object_seq;
1656 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1658 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1659 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1660 /* If filesize is 0, then there would be no objects for mapping */
1661 if (fm_key.oa.o_size == 0) {
1662 fiemap->fm_mapped_extents = 0;
1666 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1668 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1671 CERROR("obd_get_info failed: rc = %d\n", rc);
1674 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: copy the fixed-size input header from
 * userspace, allocate an output buffer sized by the requested pathlen,
 * resolve the FID to a path via the MDC, and copy the result back.
 */
1678 int ll_fid2path(struct obd_export *exp, void *arg)
1680 struct getinfo_fid2path *gfout, *gfin;
1684 /* Need to get the buflen */
1685 OBD_ALLOC_PTR(gfin);
1688 if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
/* Output = header + user-requested path buffer */
1693 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1694 OBD_ALLOC(gfout, outsize);
1695 if (gfout == NULL) {
1699 memcpy(gfout, gfin, sizeof(*gfout));
1702 /* Call mdc_iocontrol */
1703 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1706 if (cfs_copy_to_user(arg, gfout, outsize))
1710 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the
 * user-supplied extent count, copy the request (and, when continuing a
 * previous mapping, the first extent) in, run ll_do_fiemap(), and copy
 * the header plus mapped extents back to userspace.
 */
1714 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1716 struct ll_user_fiemap *fiemap_s;
1717 size_t num_bytes, ret_bytes;
1718 unsigned int extent_count;
1721 /* Get the extent count so we can calculate the size of
1722 * required fiemap buffer */
1723 if (get_user(extent_count,
1724 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count * sizeof(extent) could overflow for a
 * huge user-supplied count — confirm a bound is enforced elsewhere */
1726 num_bytes = sizeof(*fiemap_s) + (extent_count *
1727 sizeof(struct ll_fiemap_extent));
1729 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1730 if (fiemap_s == NULL)
1733 /* get the fiemap value */
1734 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1736 GOTO(error, rc = -EFAULT);
1738 /* If fm_extent_count is non-zero, read the first extent since
1739 * it is used to calculate end_offset and device from previous
1742 if (copy_from_user(&fiemap_s->fm_extents[0],
1743 (char __user *)arg + sizeof(*fiemap_s),
1744 sizeof(struct ll_fiemap_extent)))
1745 GOTO(error, rc = -EFAULT);
1748 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back header + only the extents that were actually mapped */
1752 ret_bytes = sizeof(struct ll_user_fiemap);
1754 if (extent_count != 0)
1755 ret_bytes += (fiemap_s->fm_mapped_extents *
1756 sizeof(struct ll_fiemap_extent));
1758 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1762 OBD_FREE_LARGE(fiemap_s, num_bytes);
1767 * Read the data_version for inode.
1769 * This value is computed using stripe object version on OST.
1770 * Version is computed using server side locking.
1772 * @param extent_lock Take extent lock. Not needed if a process is already
1773 * holding the OST object group locks.
1775 static int ll_data_version(struct inode *inode, __u64 *data_version,
1778 struct lov_stripe_md *lsm = NULL;
1779 struct ll_sb_info *sbi = ll_i2sbi(inode);
1780 struct obdo *obdo = NULL;
1784 /* If no stripe, we consider version is 0. */
1785 lsm = ccc_inode_lsm_get(inode);
1788 CDEBUG(D_INODE, "No object for inode\n");
1792 OBD_ALLOC_PTR(obdo);
1794 ccc_inode_lsm_put(inode, lsm);
/* getattr against the OST objects fills o_data_version */
1798 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1800 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1803 *data_version = obdo->o_data_version;
1807 ccc_inode_lsm_put(inode, lsm);
/*
 * unlocked_ioctl entry point for regular files: dispatches the Lustre
 * private ioctls (flags, striping, recreate, fiemap, group locks,
 * fid2path, data version, MDT index, ...) and falls back to the
 * dynamically registered handlers / obd_iocontrol for everything else.
 */
1812 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1814 struct inode *inode = file->f_dentry->d_inode;
1815 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1820 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1821 inode->i_generation, inode, cmd);
1822 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1824 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1825 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1829 case LL_IOC_GETFLAGS:
1830 /* Get the current value of the file flags */
1831 return put_user(fd->fd_flags, (int *)arg);
1832 case LL_IOC_SETFLAGS:
1833 case LL_IOC_CLRFLAGS:
1834 /* Set or clear specific file flags */
1835 /* XXX This probably needs checks to ensure the flags are
1836 * not abused, and to handle any flag side effects.
1838 if (get_user(flags, (int *) arg))
/* IGNORE_LOCK is only safe with O_DIRECT (no cached pages) */
1841 if (cmd == LL_IOC_SETFLAGS) {
1842 if ((flags & LL_FILE_IGNORE_LOCK) &&
1843 !(file->f_flags & O_DIRECT)) {
1844 CERROR("%s: unable to disable locking on "
1845 "non-O_DIRECT file\n", current->comm);
1849 fd->fd_flags |= flags;
1851 fd->fd_flags &= ~flags;
1854 case LL_IOC_LOV_SETSTRIPE:
1855 RETURN(ll_lov_setstripe(inode, file, arg));
1856 case LL_IOC_LOV_SETEA:
1857 RETURN(ll_lov_setea(inode, file, arg));
1858 case LL_IOC_LOV_GETSTRIPE:
1859 RETURN(ll_lov_getstripe(inode, arg));
1860 case LL_IOC_RECREATE_OBJ:
1861 RETURN(ll_lov_recreate_obj(inode, arg));
1862 case LL_IOC_RECREATE_FID:
1863 RETURN(ll_lov_recreate_fid(inode, arg));
1864 case FSFILT_IOC_FIEMAP:
1865 RETURN(ll_ioctl_fiemap(inode, arg));
1866 case FSFILT_IOC_GETFLAGS:
1867 case FSFILT_IOC_SETFLAGS:
1868 RETURN(ll_iocontrol(inode, file, cmd, arg));
1869 case FSFILT_IOC_GETVERSION_OLD:
1870 case FSFILT_IOC_GETVERSION:
1871 RETURN(put_user(inode->i_generation, (int *)arg));
1872 case LL_IOC_GROUP_LOCK:
1873 RETURN(ll_get_grouplock(inode, file, arg));
1874 case LL_IOC_GROUP_UNLOCK:
1875 RETURN(ll_put_grouplock(inode, file, arg));
1876 case IOC_OBD_STATFS:
1877 RETURN(ll_obd_statfs(inode, (void *)arg));
1879 /* We need to special case any other ioctls we want to handle,
1880 * to send them to the MDS/OST as appropriate and to properly
1881 * network encode the arg field.
1882 case FSFILT_IOC_SETVERSION_OLD:
1883 case FSFILT_IOC_SETVERSION:
1885 case LL_IOC_FLUSHCTX:
1886 RETURN(ll_flush_ctx(inode));
1887 case LL_IOC_PATH2FID: {
1888 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1889 sizeof(struct lu_fid)))
1894 case OBD_IOC_FID2PATH:
1895 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1896 case LL_IOC_DATA_VERSION: {
1897 struct ioc_data_version idv;
1900 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
/* NOFLUSH => skip the extent lock (caller holds group locks) */
1903 rc = ll_data_version(inode, &idv.idv_version,
1904 !(idv.idv_flags & LL_DV_NOFLUSH));
1907 cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1913 case LL_IOC_GET_MDTIDX: {
1916 mdtidx = ll_get_mdt_idx(inode);
1920 if (put_user((int)mdtidx, (int*)arg))
1925 case OBD_IOC_GETDTNAME:
1926 case OBD_IOC_GETMDNAME:
1927 RETURN(ll_get_obd_name(inode, cmd, arg));
/* default: try dynamically registered handlers, then the OSC */
1932 ll_iocontrol_call(inode, file, cmd, arg, &err))
1935 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek implementation.  For SEEK_END the cluster-wide size must be
 * fetched first (glimpse), since i_size may be stale on this client.
 */
1941 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1943 struct inode *inode = file->f_dentry->d_inode;
/* retval here is only used for the trace message below */
1946 retval = offset + ((origin == 2) ? i_size_read(inode) :
1947 (origin == 1) ? file->f_pos : 0);
1948 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1949 inode->i_ino, inode->i_generation, inode, retval, retval,
1950 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1951 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1953 if (origin == 2) { /* SEEK_END */
/* Refresh i_size from the OSTs before using it */
1956 rc = ll_glimpse_size(inode);
1960 offset += i_size_read(inode);
1961 } else if (origin == 1) { /* SEEK_CUR */
1962 offset += file->f_pos;
1966 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1967 if (offset != file->f_pos) {
1968 file->f_pos = offset;
/*
 * ->flush (called on close(2)): report any previously recorded write
 * or async-writeback error for this inode as -EIO.  Does not initiate
 * new I/O itself.
 */
1976 int ll_flush(struct file *file, fl_owner_t id)
1978 struct inode *inode = file->f_dentry->d_inode;
1979 struct ll_inode_info *lli = ll_i2info(inode);
1982 LASSERT(!S_ISDIR(inode->i_mode));
1984 /* the application should know write failure already. */
1985 if (lli->lli_write_rc)
1988 /* catch async errors that were recorded back when async writeback
1989 * failed for pages in this mapping. */
/* Errors are consumed (reset to 0) once reported */
1990 rc = lli->lli_async_rc;
1991 lli->lli_async_rc = 0;
1992 err = lov_read_and_clear_async_rc(lli->lli_clob);
1996 return rc ? -EIO : 0;
2000 * Called to make sure a portion of file has been written out.
2001 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2003 * Return how many pages have been written.
/* NOTE(review): the doc mentions @local_only but the visible signature
 * takes enum cl_fsync_mode — comment predates the mode parameter. */
2005 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2006 enum cl_fsync_mode mode)
2008 struct cl_env_nest nest;
2011 struct obd_capa *capa = NULL;
2012 struct cl_fsync_io *fio;
/* Only the four known fsync modes are accepted */
2016 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2017 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2020 env = cl_env_nested_get(&nest);
2022 RETURN(PTR_ERR(env));
2024 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2026 io = ccc_env_thread_io(env);
2027 io->ci_obj = cl_i2info(inode)->lli_clob;
2028 io->ci_ignore_layout = 1;
2030 /* initialize parameters for sync */
2031 fio = &io->u.ci_fsync;
2032 fio->fi_capa = capa;
2033 fio->fi_start = start;
2035 fio->fi_fid = ll_inode2fid(inode);
2036 fio->fi_mode = mode;
2037 fio->fi_nr_written = 0;
2039 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2040 result = cl_io_loop(env, io);
2042 result = io->ci_result;
/* On success, return the page count accumulated by the fsync io */
2044 result = fio->fi_nr_written;
2045 cl_io_fini(env, io);
2046 cl_env_nested_put(&nest, env);
/*
 * ->fsync: wait for in-flight writeback, report recorded async errors,
 * sync metadata via md_sync() to the MDS, then (if striped) force data
 * out to the OSTs with cl_sync_file_range(CL_FSYNC_ALL).  The three
 * prototypes track kernel API changes in the fsync file operation.
 */
2053 #ifdef HAVE_FILE_FSYNC_4ARGS
2054 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2055 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2056 int ll_fsync(struct file *file, int data)
2058 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2061 struct inode *inode = file->f_dentry->d_inode;
2062 struct ll_inode_info *lli = ll_i2info(inode);
2063 struct ptlrpc_request *req;
2064 struct obd_capa *oc;
2065 struct lov_stripe_md *lsm;
2068 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2069 inode->i_generation, inode);
2070 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2072 /* fsync's caller has already called _fdata{sync,write}, we want
2073 * that IO to finish before calling the osc and mdc sync methods */
2074 rc = filemap_fdatawait(inode->i_mapping);
2076 /* catch async errors that were recorded back when async writeback
2077 * failed for pages in this mapping. */
2078 if (!S_ISDIR(inode->i_mode)) {
2079 err = lli->lli_async_rc;
2080 lli->lli_async_rc = 0;
2083 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Metadata sync to the MDS (uses an MDS capability if enabled) */
2088 oc = ll_mdscapa_get(inode);
2089 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2095 ptlrpc_req_finished(req);
2097 lsm = ccc_inode_lsm_get(inode);
/* Data sync across the whole object range */
2099 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2101 if (rc == 0 && err < 0)
/* Remember a data-sync failure so ll_flush can report it later */
2103 lli->lli_write_rc = rc < 0 ? rc : 0;
2105 ccc_inode_lsm_put(inode, lsm);
/*
 * ->lock / ->flock: translate a kernel file_lock (POSIX fcntl lock or
 * BSD flock) into an LDLM FLOCK enqueue against the MDS, then mirror
 * the granted/released state into the local lock tables so the VFS
 * bookkeeping stays consistent.
 */
2110 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2112 struct inode *inode = file->f_dentry->d_inode;
2113 struct ll_sb_info *sbi = ll_i2sbi(inode);
2114 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2115 .ei_cb_cp =ldlm_flock_completion_ast,
2116 .ei_cbdata = file_lock };
2117 struct md_op_data *op_data;
2118 struct lustre_handle lockh = {0};
2119 ldlm_policy_data_t flock = {{0}};
2124 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2125 inode->i_ino, file_lock);
2127 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2129 if (file_lock->fl_flags & FL_FLOCK) {
2130 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2131 /* flocks are whole-file locks */
2132 flock.l_flock.end = OFFSET_MAX;
2133 /* For flocks owner is determined by the local file desctiptor*/
2134 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2135 } else if (file_lock->fl_flags & FL_POSIX) {
2136 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2137 flock.l_flock.start = file_lock->fl_start;
2138 flock.l_flock.end = file_lock->fl_end;
2142 flock.l_flock.pid = file_lock->fl_pid;
2144 /* Somewhat ugly workaround for svc lockd.
2145 * lockd installs custom fl_lmops->fl_compare_owner that checks
2146 * for the fl_owner to be the same (which it always is on local node
2147 * I guess between lockd processes) and then compares pid.
2148 * As such we assign pid to the owner field to make it all work,
2149 * conflict with normal locks is unlikely since pid space and
2150 * pointer space for current->files are not intersecting */
2151 if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2152 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the requested lock type onto an LDLM mode */
2154 switch (file_lock->fl_type) {
2156 einfo.ei_mode = LCK_PR;
2159 /* An unlock request may or may not have any relation to
2160 * existing locks so we may not be able to pass a lock handle
2161 * via a normal ldlm_lock_cancel() request. The request may even
2162 * unlock a byte range in the middle of an existing lock. In
2163 * order to process an unlock request we need all of the same
2164 * information that is given with a normal read or write record
2165 * lock request. To avoid creating another ldlm unlock (cancel)
2166 * message we'll treat a LCK_NL flock request as an unlock. */
2167 einfo.ei_mode = LCK_NL;
2170 einfo.ei_mode = LCK_PW;
2173 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2174 file_lock->fl_type);
/* Command-specific enqueue flags (elided cases set these) */
2189 flags = LDLM_FL_BLOCK_NOWAIT;
2195 flags = LDLM_FL_TEST_LOCK;
2196 /* Save the old mode so that if the mode in the lock changes we
2197 * can decrement the appropriate reader or writer refcount. */
2198 file_lock->fl_type = einfo.ei_mode;
2201 CERROR("unknown fcntl lock command: %d\n", cmd);
2205 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2206 LUSTRE_OPC_ANY, NULL);
2207 if (IS_ERR(op_data))
2208 RETURN(PTR_ERR(op_data));
2210 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2211 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2212 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* The actual cluster-wide lock request goes to the MDS */
2214 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2215 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2217 ll_finish_md_op_data(op_data);
/* Mirror successful grants/unlocks into the local VFS lock lists */
2219 if ((file_lock->fl_flags & FL_FLOCK) &&
2220 (rc == 0 || file_lock->fl_type == F_UNLCK))
2221 flock_lock_file_wait(file, file_lock);
2222 if ((file_lock->fl_flags & FL_POSIX) &&
2223 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2224 !(flags & LDLM_FL_TEST_LOCK))
2225 posix_lock_file_wait(file, file_lock);
/* -o noflock variant of ->lock/->flock: body (elided here) rejects all
 * lock requests — used by the ll_file_operations_noflock table below. */
2230 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2238 * test if some locks matching bits and l_req_mode are acquired
2239 * - bits can be in different locks
2240 * - if found clear the common lock bits in *bits
2241 * - the bits not found, are kept in *bits
2243 * \param bits [IN] searched lock bits [IN]
2244 * \param l_req_mode [IN] searched lock mode
2245 * \retval boolean, true iff all bits are found
2247 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2249 struct lustre_handle lockh;
2250 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four ibits modes */
2251 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2252 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2261 fid = &ll_i2info(inode)->lli_fid;
2262 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2263 ldlm_lockname[mode]);
/* TEST_LOCK: match only, do not take a reference on the lock */
2265 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2266 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2267 policy.l_inodebits.bits = *bits & (1 << i);
2268 if (policy.l_inodebits.bits == 0)
2271 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2272 &policy, mode, &lockh)) {
2273 struct ldlm_lock *lock;
2275 lock = ldlm_handle2lock(&lockh);
/* Clear every bit covered by the matched lock, not just bit i */
2278 ~(lock->l_policy_data.l_inodebits.bits);
2279 LDLM_LOCK_PUT(lock);
2281 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a granted MDS ibits lock covering
 * @bits on @inode, in any mode.  Returns the matched mode (0 if none);
 * the handle is returned via @lockh.
 */
2288 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2289 struct lustre_handle *lockh)
2291 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2297 fid = &ll_i2info(inode)->lli_fid;
2298 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Unlike ll_have_md_lock(), no TEST_LOCK: the reference is kept */
2300 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2301 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2302 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process a revalidation result: -ENOENT on a special file means
 * the object was unlinked and is treated as success; other errors are
 * logged.  Returns the (possibly adjusted) rc.
 */
2306 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2307 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2308 * and return success */
2310 /* This path cannot be hit for regular files unless in
2311 * case of obscure races, so no need to to validate
2313 if (!S_ISREG(inode->i_mode) &&
2314 !S_ISDIR(inode->i_mode))
2319 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate a dentry/inode's metadata covered by the ibits in @ibits.
 * Two strategies: with OBD_CONNECT_ATTRFID, an intent-getattr by FID
 * (which also refreshes the dentry state); otherwise, if no matching
 * MDS lock is cached, a plain md_getattr whose reply repopulates the
 * inode via ll_prep_inode().
 */
2327 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2330 struct inode *inode = dentry->d_inode;
2331 struct ptlrpc_request *req = NULL;
2332 struct obd_export *exp;
/* Reached only with a NULL inode — should not happen (elided branch) */
2337 CERROR("REPORT THIS LINE TO PETER\n");
2341 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2342 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2344 exp = ll_i2mdexp(inode);
2346 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2347 * But under CMD case, it caused some lock issues, should be fixed
2348 * with new CMD ibits lock. See bug 12718 */
2349 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2350 struct lookup_intent oit = { .it_op = IT_GETATTR };
2351 struct md_op_data *op_data;
2353 if (ibits == MDS_INODELOCK_LOOKUP)
2354 oit.it_op = IT_LOOKUP;
2356 /* Call getattr by fid, so do not provide name at all. */
2357 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2358 dentry->d_inode, NULL, 0, 0,
2359 LUSTRE_OPC_ANY, NULL);
2360 if (IS_ERR(op_data))
2361 RETURN(PTR_ERR(op_data));
2363 oit.it_create_mode |= M_CHECK_STALE;
2364 rc = md_intent_lock(exp, op_data, NULL, 0,
2365 /* we are not interested in name
2368 ll_md_blocking_ast, 0);
2369 ll_finish_md_op_data(op_data);
2370 oit.it_create_mode &= ~M_CHECK_STALE;
2372 rc = ll_inode_revalidate_fini(inode, rc);
2376 rc = ll_revalidate_it_finish(req, &oit, dentry);
2378 ll_intent_release(&oit);
2382 /* Unlinked? Unhash dentry, so it is not picked up later by
2383 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2384 here to preserve get_cwd functionality on 2.6.
2386 if (!dentry->d_inode->i_nlink)
2387 d_lustre_invalidate(dentry);
2389 ll_lookup_finish_locks(&oit, dentry);
2390 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2391 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2392 obd_valid valid = OBD_MD_FLGETATTR;
2393 struct md_op_data *op_data;
/* Regular files: also fetch the striping EA with the getattr */
2396 if (S_ISREG(inode->i_mode)) {
2397 rc = ll_get_max_mdsize(sbi, &ealen);
2400 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2403 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2404 0, ealen, LUSTRE_OPC_ANY,
2406 if (IS_ERR(op_data))
2407 RETURN(PTR_ERR(op_data));
2409 op_data->op_valid = valid;
2410 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2411 * capa for this inode. Because we only keep capas of dirs
2413 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2414 ll_finish_md_op_data(op_data);
2416 rc = ll_inode_revalidate_fini(inode, rc);
2420 rc = ll_prep_inode(&inode, req, NULL);
2423 ptlrpc_req_finished(req);
/*
 * Revalidate metadata, then refresh the size/time attributes: if no
 * OST objects exist yet, copy times straight from the cached lvb;
 * otherwise glimpse the size from the OSTs.
 */
2427 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2430 struct inode *inode = dentry->d_inode;
2434 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2436 /* if object not yet allocated, don't validate size */
2437 if (rc == 0 && !ll_i2info(dentry->d_inode)->lli_has_smd) {
2438 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2439 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2440 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2444 /* ll_glimpse_size will prefer locally cached writes if they extend
2448 rc = ll_glimpse_size(inode);
/*
 * stat(2) backend: revalidate UPDATE|LOOKUP ibits, then fill *stat
 * from the (now fresh) inode fields.  32-bit API clients get a hashed
 * FID-based inode number instead of i_ino.
 */
2453 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2454 struct lookup_intent *it, struct kstat *stat)
2456 struct inode *inode = de->d_inode;
2457 struct ll_sb_info *sbi = ll_i2sbi(inode);
2458 struct ll_inode_info *lli = ll_i2info(inode);
2461 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2462 MDS_INODELOCK_LOOKUP);
2463 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2468 stat->dev = inode->i_sb->s_dev;
2469 if (ll_need_32bit_api(sbi))
2470 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2472 stat->ino = inode->i_ino;
2473 stat->mode = inode->i_mode;
2474 stat->nlink = inode->i_nlink;
2475 stat->uid = inode->i_uid;
2476 stat->gid = inode->i_gid;
2477 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2478 stat->atime = inode->i_atime;
2479 stat->mtime = inode->i_mtime;
2480 stat->ctime = inode->i_ctime;
2481 stat->blksize = 1 << inode->i_blkbits;
2483 stat->size = i_size_read(inode);
2484 stat->blocks = inode->i_blocks;
/* ->getattr entry point: wraps ll_getattr_it with an IT_GETATTR intent */
2488 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2490 struct lookup_intent it = { .it_op = IT_GETATTR };
2492 return ll_getattr_it(mnt, de, &it, stat);
2495 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * VFS ->fiemap backend: marshal the kernel fiemap_extent_info into a
 * ll_user_fiemap, run ll_do_fiemap(), and copy flags plus the mapped
 * extents back.  Only the first pre-existing extent is copied in.
 */
2496 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2497 __u64 start, __u64 len)
2501 struct ll_user_fiemap *fiemap;
2502 unsigned int extent_count = fieinfo->fi_extents_max;
2504 num_bytes = sizeof(*fiemap) + (extent_count *
2505 sizeof(struct ll_fiemap_extent));
2506 OBD_ALLOC_LARGE(fiemap, num_bytes);
2511 fiemap->fm_flags = fieinfo->fi_flags;
2512 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2513 fiemap->fm_start = start;
2514 fiemap->fm_length = len;
/* NOTE(review): fi_extents_start is a user pointer; presumably a
 * copy_from_user-style access is safe here in this kernel — confirm */
2515 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2516 sizeof(struct ll_fiemap_extent));
2518 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2520 fieinfo->fi_flags = fiemap->fm_flags;
2521 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2522 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2523 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2525 OBD_FREE_LARGE(fiemap, num_bytes);
2530 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL check callback passed to generic_permission(): evaluates the
 * cached POSIX ACL (duplicated under lli_lock) against @mask.
 * Compiled out when CONFIG_FS_POSIX_ACL is off; the 4-arg variant
 * bails under RCU walk (cannot block).
 */
2532 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2533 lustre_check_acl(struct inode *inode, int mask, unsigned int flags)
2535 lustre_check_acl(struct inode *inode, int mask)
2538 # ifdef CONFIG_FS_POSIX_ACL
2539 struct ll_inode_info *lli = ll_i2info(inode);
2540 struct posix_acl *acl;
2544 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
2545 if (flags & IPERM_FLAG_RCU)
/* Take a private reference so the ACL can be used outside the lock */
2548 cfs_spin_lock(&lli->lli_lock);
2549 acl = posix_acl_dup(lli->lli_posix_acl);
2550 cfs_spin_unlock(&lli->lli_lock);
2555 rc = posix_acl_permission(inode, acl, mask);
2556 posix_acl_release(acl);
2559 # else /* !CONFIG_FS_POSIX_ACL */
2561 # endif /* CONFIG_FS_POSIX_ACL */
2563 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission: revalidate the root inode if needed (it is not
 * validated by lookup), delegate to the remote-permission path on
 * remote-client mounts, otherwise run generic_permission() with the
 * Lustre ACL callback.  Prototypes track kernel API changes.
 */
2565 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2566 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2568 # ifdef HAVE_INODE_PERMISION_2ARGS
2569 int ll_inode_permission(struct inode *inode, int mask)
2571 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* RCU-walk cannot block on RPCs; punt to ref-walk */
2578 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2579 if (flags & IPERM_FLAG_RCU)
2583 /* as root inode are NOT getting validated in lookup operation,
2584 * need to do it before permission check. */
2586 if (inode == inode->i_sb->s_root->d_inode) {
2587 struct lookup_intent it = { .it_op = IT_LOOKUP };
2589 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2590 MDS_INODELOCK_LOOKUP);
2595 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2596 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2598 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2599 return lustre_check_remote_perm(inode, mask);
2601 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2602 rc = ll_generic_permission(inode, mask, flags, lustre_check_acl);
/* Select the vectored read/write file-operation names: older kernels
 * use ->readv/->writev, newer ones ->aio_read/->aio_write.  The macros
 * let the three file_operations tables below stay identical in form. */
2607 #ifdef HAVE_FILE_READV
2608 #define READ_METHOD readv
2609 #define READ_FUNCTION ll_file_readv
2610 #define WRITE_METHOD writev
2611 #define WRITE_FUNCTION ll_file_writev
2613 #define READ_METHOD aio_read
2614 #define READ_FUNCTION ll_file_aio_read
2615 #define WRITE_METHOD aio_write
2616 #define WRITE_FUNCTION ll_file_aio_write
2619 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table (no cluster-wide flock support).
 * .READ_METHOD/.WRITE_METHOD are macros expanded to the kernel-appropriate
 * designator (readv/writev or aio_read/aio_write) — see the #defines above. */
2620 struct file_operations ll_file_operations = {
2621 .read = ll_file_read,
2622 .READ_METHOD = READ_FUNCTION,
2623 .write = ll_file_write,
2624 .WRITE_METHOD = WRITE_FUNCTION,
2625 .unlocked_ioctl = ll_file_ioctl,
2626 .open = ll_file_open,
2627 .release = ll_file_release,
2628 .mmap = ll_file_mmap,
2629 .llseek = ll_file_seek,
2630 #ifdef HAVE_KERNEL_SENDFILE
2631 .sendfile = ll_file_sendfile,
2633 #ifdef HAVE_KERNEL_SPLICE_READ
2634 .splice_read = ll_file_splice_read,
/* file_operations for -o flock mounts: identical to ll_file_operations but
 * additionally wires ->flock and ->lock to ll_file_flock for cluster-wide
 * coherent file locking. */
2640 struct file_operations ll_file_operations_flock = {
2641 .read = ll_file_read,
2642 .READ_METHOD = READ_FUNCTION,
2643 .write = ll_file_write,
2644 .WRITE_METHOD = WRITE_FUNCTION,
2645 .unlocked_ioctl = ll_file_ioctl,
2646 .open = ll_file_open,
2647 .release = ll_file_release,
2648 .mmap = ll_file_mmap,
2649 .llseek = ll_file_seek,
2650 #ifdef HAVE_KERNEL_SENDFILE
2651 .sendfile = ll_file_sendfile,
2653 #ifdef HAVE_KERNEL_SPLICE_READ
2654 .splice_read = ll_file_splice_read,
2658 .flock = ll_file_flock,
2659 .lock = ll_file_flock
2662 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Same table as above, but ->flock/->lock point at ll_file_noflock so any
 * locking attempt fails explicitly instead of being silently local. */
2663 struct file_operations ll_file_operations_noflock = {
2664 .read = ll_file_read,
2665 .READ_METHOD = READ_FUNCTION,
2666 .write = ll_file_write,
2667 .WRITE_METHOD = WRITE_FUNCTION,
2668 .unlocked_ioctl = ll_file_ioctl,
2669 .open = ll_file_open,
2670 .release = ll_file_release,
2671 .mmap = ll_file_mmap,
2672 .llseek = ll_file_seek,
2673 #ifdef HAVE_KERNEL_SENDFILE
2674 .sendfile = ll_file_sendfile,
2676 #ifdef HAVE_KERNEL_SPLICE_READ
2677 .splice_read = ll_file_splice_read,
2681 .flock = ll_file_noflock,
2682 .lock = ll_file_noflock
/* inode_operations for regular Lustre files: attribute get/set, permission
 * checking, extended attributes, and (when available) FIEMAP extent maps. */
2685 struct inode_operations ll_file_inode_operations = {
2686 .setattr = ll_setattr,
2687 .truncate = ll_truncate,
2688 .getattr = ll_getattr,
2689 .permission = ll_inode_permission,
2690 .setxattr = ll_setxattr,
2691 .getxattr = ll_getxattr,
2692 .listxattr = ll_listxattr,
2693 .removexattr = ll_removexattr,
2694 #ifdef HAVE_LINUX_FIEMAP_H
2695 .fiemap = ll_fiemap,
2699 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore.  Readers are the
 * ioctl dispatch path; writers are register/unregister. */
2700 static struct llioc_ctl_data {
2701 cfs_rw_semaphore_t ioc_sem;
2702 cfs_list_t ioc_head;
2704 __RWSEM_INITIALIZER(llioc.ioc_sem),
2705 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: its list linkage, total allocation size (for
 * OBD_FREE on unregister), callback, and the trailing variable-length
 * array of ioctl command numbers it services. */
2710 cfs_list_t iocd_list;
2711 unsigned int iocd_size;
2712 llioc_callback_t iocd_cb;
2713 unsigned int iocd_count;
/* [0]-sized trailing array (pre-C99 flexible array member idiom) */
2714 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler.
 * @cb:    callback invoked for matching ioctl commands
 * @count: number of entries in @cmd
 * @cmd:   array of ioctl command numbers serviced by @cb
 * Returns an opaque cookie (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or presumably NULL on bad args / ENOMEM —
 * the error-return lines are not visible in this fragment.
 */
2717 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2720 struct llioc_data *in_data = NULL;
/* validate arguments before sizing the allocation */
2723 if (cb == NULL || cmd == NULL ||
2724 count > LLIOC_MAX_CMD || count < 0)
/* header plus trailing iocd_cmd[] array in a single allocation */
2727 size = sizeof(*in_data) + count * sizeof(unsigned int);
2728 OBD_ALLOC(in_data, size);
2729 if (in_data == NULL)
2732 memset(in_data, 0, sizeof(*in_data));
2733 in_data->iocd_size = size;
2734 in_data->iocd_cb = cb;
2735 in_data->iocd_count = count;
2736 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish under the write lock */
2738 cfs_down_write(&llioc.ioc_sem);
2739 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2740 cfs_up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  @magic is the cookie (the llioc_data pointer);
 * the matching entry is unlinked under the write lock and freed.  If no
 * entry matches, a warning is logged.
 * NOTE(review): the comparison of @magic against list entries is on lines
 * not visible in this fragment.
 */
2745 void ll_iocontrol_unregister(void *magic)
2747 struct llioc_data *tmp;
2752 cfs_down_write(&llioc.ioc_sem);
2753 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the stored allocation size before unlinking */
2755 unsigned int size = tmp->iocd_size;
2757 cfs_list_del(&tmp->iocd_list);
/* drop the semaphore before freeing — no need to hold it for OBD_FREE */
2758 cfs_up_write(&llioc.ioc_sem);
2760 OBD_FREE(tmp, size);
/* fell off the end of the list: @magic was never registered */
2764 cfs_up_write(&llioc.ioc_sem);
2766 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2769 EXPORT_SYMBOL(ll_iocontrol_register);
2770 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the dynamically registered handlers.
 * Walks the registry under the read lock; the first callback whose command
 * table contains @cmd is invoked and may stop further iteration by
 * returning LLIOC_STOP.  *rcp receives the handler's result (preset to
 * -EINVAL when nothing handles the command).
 */
2772 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2773 unsigned int cmd, unsigned long arg, int *rcp)
2775 enum llioc_iter ret = LLIOC_CONT;
2776 struct llioc_data *data;
2777 int rc = -EINVAL, i;
2779 cfs_down_read(&llioc.ioc_sem);
2780 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2781 for (i = 0; i < data->iocd_count; i++) {
/* skip commands this handler does not service */
2782 if (cmd != data->iocd_cmd[i])
2785 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* handler consumed the ioctl — stop scanning the registry */
2789 if (ret == LLIOC_STOP)
2792 cfs_up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode.
 * Acquires a nested client environment, applies @conf via cl_conf_set(),
 * and releases the environment.  A NULL lli_clob (no cl_object yet) is
 * handled early — presumably returning success; that line is not visible
 * in this fragment.
 */
2799 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2801 struct ll_inode_info *lli = ll_i2info(inode);
2802 struct cl_env_nest nest;
2807 if (lli->lli_clob == NULL)
2810 env = cl_env_nested_get(&nest);
/* env allocation failed: propagate the error code */
2812 RETURN(PTR_ERR(env));
2814 result = cl_conf_set(env, lli->lli_clob, conf);
2815 cl_env_nested_put(&nest, env);
2820 * This function checks if there exists a LAYOUT lock on the client side,
2821 * or enqueues it if it doesn't have one in cache.
2823 * This function will not hold layout lock so it may be revoked any time after
2824 this function returns. Any operations that depend on the layout should be redone
2827 * This function should be called before lov_io_init() to get an uptodate
2828 * layout version, the caller should save the version number and after IO
2829 * is finished, this function should be called again to verify that layout
2830 * is not changed during IO time.
2832 int ll_layout_refresh(struct inode *inode, __u32 *gen)
2834 struct ll_inode_info *lli = ll_i2info(inode);
2835 struct ll_sb_info *sbi = ll_i2sbi(inode);
2836 struct md_op_data *op_data = NULL;
2837 struct ptlrpc_request *req = NULL;
2838 struct lookup_intent it = { .it_op = IT_LAYOUT };
2839 struct lustre_handle lockh;
2841 struct cl_object_conf conf = { .coc_inode = inode,
2842 .coc_validate_only = true };
2847 if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_LAYOUT_LOCK))
2851 LASSERT(fid_is_sane(ll_inode2fid(inode)));
2852 LASSERT(S_ISREG(inode->i_mode));
2854 /* mostly layout lock is caching on the local side, so try to match
2855 * it before grabbing layout lock mutex. */
2856 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh);
2857 if (mode != 0) { /* hit cached lock */
2858 struct lov_stripe_md *lsm;
2860 lsm = ccc_inode_lsm_get(inode);
2862 *gen = lsm->lsm_layout_gen;
2863 ccc_inode_lsm_put(inode, lsm);
2864 ldlm_lock_decref(&lockh, mode);
2869 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
2870 0, 0, LUSTRE_OPC_ANY, NULL);
2871 if (IS_ERR(op_data))
2872 RETURN(PTR_ERR(op_data));
2874 /* take layout lock mutex to enqueue layout lock exclusively. */
2875 cfs_mutex_lock(&lli->lli_layout_mutex);
2877 /* make sure the old conf goes away */
2878 ll_layout_conf(inode, &conf);
2880 /* enqueue layout lock */
2881 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0,
2882 &req, ll_md_blocking_ast, 0);
2884 /* we get a new lock, so update the lock data */
2885 lockh.cookie = it.d.lustre.it_lock_handle;
2886 md_set_lock_data(sbi->ll_md_exp, &lockh.cookie, inode, NULL);
2888 /* req == NULL is when lock was found in client cache, without
2889 * any request to server (but lsm can be canceled just after a
2892 struct ldlm_lock *lock = ldlm_handle2lock(&lockh);
2893 struct lustre_md md = { NULL };
2897 /* for IT_LAYOUT lock, lmm is returned in lock's lvb
2898 * data via completion callback */
2899 LASSERT(lock != NULL);
2900 lmm = lock->l_lvb_data;
2901 lmmsize = lock->l_lvb_len;
2903 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
2907 *gen = md.lsm->lsm_layout_gen;
2909 memset(&conf, 0, sizeof conf);
2910 conf.coc_inode = inode;
2911 conf.u.coc_md = &md;
2912 ll_layout_conf(inode, &conf);
2914 lli->lli_has_smd = md.lsm != NULL;
2917 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
2919 LDLM_LOCK_PUT(lock);
2920 ptlrpc_req_finished(req);
2921 } else { /* hit caching lock */
2922 struct lov_stripe_md *lsm;
2924 lsm = ccc_inode_lsm_get(inode);
2926 *gen = lsm->lsm_layout_gen;
2927 ccc_inode_lsm_put(inode, lsm);
2929 ll_intent_drop_lock(&it);
2931 cfs_mutex_unlock(&lli->lli_layout_mutex);
2932 ll_finish_md_op_data(op_data);