4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
51 #include "cl_object.h"
/* Allocate a per-open-file ll_file_data from its slab cache.
 * NOTE(review): lines are missing from this extraction (braces/return);
 * presumably the allocated fd (possibly NULL) is returned — confirm. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
/* Slab allocation with IO-safe GFP flags to avoid recursion into FS reclaim. */
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Release a ll_file_data back to its slab cache.
 * NOTE(review): a NULL guard likely exists on a missing line — confirm. */
61 static void ll_file_data_put(struct ll_file_data *fd)
64 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current metadata into @op_data for an MDS RPC:
 * fid, mode, timestamps, size, block count, external flags, the current
 * IO epoch, the open handle @fh, and an MDS capability reference. */
67 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
68 struct lustre_handle *fh)
70 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
71 op_data->op_attr.ia_mode = inode->i_mode;
72 op_data->op_attr.ia_atime = inode->i_atime;
73 op_data->op_attr.ia_mtime = inode->i_mtime;
74 op_data->op_attr.ia_ctime = inode->i_ctime;
75 op_data->op_attr.ia_size = i_size_read(inode);
76 op_data->op_attr_blocks = inode->i_blocks;
/* Inode flags are carried in the ia_attr_flags overlay of struct ll_iattr,
 * converted to the on-wire ext flag representation. */
77 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
78 ll_inode_to_ext_flags(inode->i_flags);
79 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
81 op_data->op_handle = *fh;
/* ll_mdscapa_get() takes a capability reference; the matching put is the
 * caller's responsibility (via ll_finish_md_op_data, presumably). */
82 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare an MDS close RPC for open handle @och:
 * closes the IO epoch and packs the inode attributes into @op_data. */
86 * Closes the IO epoch and packs all the attributes into @op_data for
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
/* Always send mode and explicit a/m/ctime on close. */
94 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
95 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* NOTE(review): the branch bodies for the two conditions below are on
 * lines missing from this extraction — confirm against the full source. */
97 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client must
 * also send size/blocks on close. */
100 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
101 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
103 ll_ioepoch_close(inode, op_data, &och, 0);
106 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
107 ll_prep_md_op_data(op_data, inode, NULL, NULL,
108 0, 0, LUSTRE_OPC_ANY, NULL);
/* Close an MDS open handle: build the close op_data, send md_close(),
 * perform the Size-on-MDS update if the MDS requested it, destroy OST
 * objects named in the close reply, and tear down replay data.
 * NOTE(review): several error paths and RETURN statements are on lines
 * missing from this extraction. */
112 static int ll_close_inode_openhandle(struct obd_export *md_exp,
114 struct obd_client_handle *och)
116 struct obd_export *exp = ll_i2mdexp(inode);
117 struct md_op_data *op_data;
118 struct ptlrpc_request *req = NULL;
119 struct obd_device *obd = class_exp2obd(exp);
/* Guard against a dead/invalid MDC connection before issuing the RPC. */
126 * XXX: in case of LMV, is this correct to access
129 CERROR("Invalid MDC connection handle "LPX64"\n",
130 ll_i2mdexp(inode)->exp_handle.h_cookie);
134 OBD_ALLOC_PTR(op_data);
/* Pre-existing XXX: on allocation failure the open handle and request
 * are leaked. */
136 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138 ll_prepare_close(inode, op_data, och);
139 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
140 rc = md_close(md_exp, op_data, och->och_mod, &req);
142 /* This close must have the epoch closed. */
143 LASSERT(epoch_close);
144 /* MDS has instructed us to obtain Size-on-MDS attribute from
145 * OSTs and send setattr to back to MDS. */
146 rc = ll_som_update(inode, op_data);
148 CERROR("inode %lu mdc Size-on-MDS update failed: "
149 "rc = %d\n", inode->i_ino, rc);
153 CERROR("inode %lu mdc close failed: rc = %d\n",
156 ll_finish_md_op_data(op_data);
/* The close reply may carry a list of OST objects to destroy. */
159 rc = ll_objects_destroy(req, inode);
161 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* If the epoch was not closed yet, a DONE_WRITING RPC is still owed for
 * SOM-enabled regular files opened for write. */
168 if (exp_connect_som(exp) && !epoch_close &&
169 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
170 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
172 md_clear_open_replay_data(md_exp, och);
173 /* Free @och if it is not waiting for DONE_WRITING. */
/* Poison the cookie so stale users of this handle are detectable. */
174 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177 if (req) /* This is close request */
178 ptlrpc_req_finished(req);
/* Really close the MDS open handle of the kind selected by @flags
 * (write/exec/read), unless other users of that handle remain.
 * NOTE(review): the och pointer swap and usecount handling between the
 * visible lines is missing from this extraction — confirm. */
182 int ll_md_real_close(struct inode *inode, int flags)
184 struct ll_inode_info *lli = ll_i2info(inode);
185 struct obd_client_handle **och_p;
186 struct obd_client_handle *och;
/* Pick the per-mode open handle and its use counter. */
191 if (flags & FMODE_WRITE) {
192 och_p = &lli->lli_mds_write_och;
193 och_usecount = &lli->lli_open_fd_write_count;
194 } else if (flags & FMODE_EXEC) {
195 och_p = &lli->lli_mds_exec_och;
196 och_usecount = &lli->lli_open_fd_exec_count;
198 LASSERT(flags & FMODE_READ);
199 och_p = &lli->lli_mds_read_och;
200 och_usecount = &lli->lli_open_fd_read_count;
203 cfs_mutex_lock(&lli->lli_och_mutex);
204 if (*och_usecount) { /* There are still users of this handle, so
206 cfs_mutex_unlock(&lli->lli_och_mutex);
211 cfs_mutex_unlock(&lli->lli_och_mutex);
213 if (och) { /* There might be a race and somebody have freed this och
215 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open counter, and — if no cached OPEN DLM lock lets us skip it — do the
 * real MDS close. Finally release the ll_file_data and MDS capability. */
222 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
226 struct ll_inode_info *lli = ll_i2info(inode);
230 /* clear group lock, if present */
231 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
232 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
234 /* Let's see if we have good enough OPEN lock on the file and if
235 we can skip talking to MDS */
236 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a granted OPEN ibits lock, don't take a ref. */
238 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
239 struct lustre_handle lockh;
240 struct inode *inode = file->f_dentry->d_inode;
241 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
243 cfs_mutex_lock(&lli->lli_och_mutex);
/* Decrement the open-mode counter matching how this fd was opened;
 * NOTE(review): lockmode assignments appear to be on missing lines. */
244 if (fd->fd_omode & FMODE_WRITE) {
246 LASSERT(lli->lli_open_fd_write_count);
247 lli->lli_open_fd_write_count--;
248 } else if (fd->fd_omode & FMODE_EXEC) {
250 LASSERT(lli->lli_open_fd_exec_count);
251 lli->lli_open_fd_exec_count--;
254 LASSERT(lli->lli_open_fd_read_count);
255 lli->lli_open_fd_read_count--;
257 cfs_mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock => must send the close RPC to the MDS. */
259 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
260 LDLM_IBITS, &policy, lockmode,
262 rc = ll_md_real_close(file->f_dentry->d_inode,
266 CERROR("Releasing a file %p with negative dentry %p. Name %s",
267 file, file->f_dentry, file->f_dentry->d_name.name);
270 LUSTRE_FPRIVATE(file) = NULL;
271 ll_file_data_put(fd);
272 ll_capa_close(inode);
277 /* While this returns an error code, fput() the caller does not, so we need
278 * to make every effort to clean up all of our state here. Also, applications
279 * rarely check close errors and even if an error is returned they will not
280 * re-try the close call.
282 int ll_file_release(struct inode *inode, struct file *file)
284 struct ll_file_data *fd;
285 struct ll_sb_info *sbi = ll_i2sbi(inode);
286 struct ll_inode_info *lli = ll_i2info(inode);
290 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
291 inode->i_generation, inode);
/* Remote-client ACL bookkeeping applies only when releasing the root
 * inode of an RMT_CLIENT mount. */
293 #ifdef CONFIG_FS_POSIX_ACL
294 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
295 inode == inode->i_sb->s_root->d_inode) {
296 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
299 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
300 fd->fd_flags &= ~LL_FILE_RMTACL;
301 rct_del(&sbi->ll_rct, cfs_curproc_pid());
302 et_search_free(&sbi->ll_et, cfs_curproc_pid());
/* Don't account releases of the root dentry in the stats. */
307 if (inode->i_sb->s_root != file->f_dentry)
308 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
309 fd = LUSTRE_FPRIVATE(file);
312 /* The last ref on @file, maybe not the the owner pid of statahead.
313 * Different processes can open the same dir, "ll_opendir_key" means:
314 * it is me that should stop the statahead thread. */
315 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
316 lli->lli_opendir_pid != 0)
317 ll_stop_statahead(inode, lli->lli_opendir_key);
/* The root directory carries no MDS open handle; just drop the fd. */
319 if (inode->i_sb->s_root == file->f_dentry) {
320 LUSTRE_FPRIVATE(file) = NULL;
321 ll_file_data_put(fd);
/* For regular files, fold async write errors into the inode so close
 * can report them. */
325 if (!S_ISDIR(inode->i_mode)) {
326 lov_read_and_clear_async_rc(lli->lli_clob);
327 lli->lli_async_rc = 0;
330 rc = ll_md_close(sbi->ll_md_exp, inode, file);
332 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
333 libcfs_debug_dumplog();
/* Send an IT_OPEN intent to the MDS for @file (used by NFSD-style opens
 * and by setstripe). On success, fills the inode from the reply and sets
 * up DLM lock data. The out/out_req error paths release the open handle
 * and drop the request/lock. */
338 static int ll_intent_file_open(struct file *file, void *lmm,
339 int lmmsize, struct lookup_intent *itp)
341 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
342 struct dentry *parent = file->f_dentry->d_parent;
343 const char *name = file->f_dentry->d_name.name;
344 const int len = file->f_dentry->d_name.len;
345 struct md_op_data *op_data;
346 struct ptlrpc_request *req;
347 __u32 opc = LUSTRE_OPC_ANY;
354 /* Usually we come here only for NFSD, and we want open lock.
355 But we can also get here with pre 2.6.15 patchless kernels, and in
356 that case that lock is also ok */
357 /* We can also get here if there was cached open handle in revalidate_it
358 * but it disappeared while we were getting from there to ll_file_open.
359 * But this means this file was closed and immediatelly opened which
360 * makes a good candidate for using OPEN lock */
361 /* If lmmsize & lmm are not 0, we are just setting stripe info
362 * parameters. No need for the open lock */
363 if (lmm == NULL && lmmsize == 0) {
364 itp->it_flags |= MDS_OPEN_LOCK;
365 if (itp->it_flags & FMODE_WRITE)
366 opc = LUSTRE_OPC_CREATE;
369 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
370 file->f_dentry->d_inode, name, len,
373 RETURN(PTR_ERR(op_data));
375 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
376 0 /*unused */, &req, ll_md_blocking_ast, 0);
377 ll_finish_md_op_data(op_data);
379 /* reason for keep own exit path - don`t flood log
380 * with messages with -ESTALE errors.
/* If the open was granted on the MDS side but then failed, the server-
 * side open handle must be released before bailing out. */
382 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
383 it_open_error(DISP_OPEN_OPEN, itp))
385 ll_release_openhandle(file->f_dentry, itp);
389 if (it_disposition(itp, DISP_LOOKUP_NEG))
390 GOTO(out, rc = -ENOENT);
392 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
393 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
394 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the intent reply and attach the DLM lock. */
398 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
399 if (!rc && itp->d.lustre.it_lock_mode)
400 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
404 ptlrpc_req_finished(itp->d.lustre.it_data);
405 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
406 ll_intent_drop_lock(itp);
412 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
413 * not believe attributes if a few ioepoch holders exist. Attributes for
414 * previous ioepoch if new one is opened are also skipped by MDS.
416 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch, and only when it actually changes. */
418 if (ioepoch && lli->lli_ioepoch != ioepoch) {
419 lli->lli_ioepoch = ioepoch;
420 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
421 ioepoch, PFID(&lli->lli_fid));
/* Fill an obd_client_handle from the MDS open reply attached to @it:
 * copy the file handle, fid and open flags, record the IO epoch, and
 * register the request for open replay. Returns md_set_open_replay_data's
 * result. */
425 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
426 struct lookup_intent *it, struct obd_client_handle *och)
428 struct ptlrpc_request *req = it->d.lustre.it_data;
429 struct mdt_body *body;
433 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
434 LASSERT(body != NULL); /* reply already checked out */
436 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
437 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
438 och->och_fid = lli->lli_fid;
439 och->och_flags = it->it_flags;
440 ll_ioepoch_open(lli, body->ioepoch);
442 return md_set_open_replay_data(md_exp, och, req);
/* Finish an open locally: optionally fill @och from the intent reply,
 * then attach @fd as the file's private data, initialise readahead state
 * and remember the open mode.
 * NOTE(review): the conditional structure around the och!=NULL case is on
 * lines missing from this extraction — confirm. */
445 int ll_local_open(struct file *file, struct lookup_intent *it,
446 struct ll_file_data *fd, struct obd_client_handle *och)
448 struct inode *inode = file->f_dentry->d_inode;
449 struct ll_inode_info *lli = ll_i2info(inode);
/* private_data must have been cleared by the caller (ll_file_open). */
452 LASSERT(!LUSTRE_FPRIVATE(file));
457 struct ptlrpc_request *req = it->d.lustre.it_data;
458 struct mdt_body *body;
461 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
465 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
466 if ((it->it_flags & FMODE_WRITE) &&
467 (body->valid & OBD_MD_FLSIZE))
468 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
469 lli->lli_ioepoch, PFID(&lli->lli_fid));
472 LUSTRE_FPRIVATE(file) = fd;
473 ll_readahead_init(inode, &fd->fd_ras);
474 fd->fd_omode = it->it_flags;
478 /* Open a file, and (for the very first open) create objects on the OSTs at
479 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
480 * creation or open until ll_lov_setstripe() ioctl is called.
482 * If we already have the stripe MD locally then we don't request it in
483 * md_open(), by passing a lmm_size = 0.
485 * It is up to the application to ensure no other processes open this file
486 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
487 * used. We might be able to avoid races of that sort by getting lli_open_sem
488 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
489 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
491 int ll_file_open(struct inode *inode, struct file *file)
493 struct ll_inode_info *lli = ll_i2info(inode);
494 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
495 .it_flags = file->f_flags };
496 struct obd_client_handle **och_p = NULL;
497 __u64 *och_usecount = NULL;
498 struct ll_file_data *fd;
499 int rc = 0, opendir_set = 0;
502 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
503 inode->i_generation, inode, file->f_flags);
/* An intent may have been stashed in private_data by lookup/revalidate. */
505 it = file->private_data; /* XXX: compat macro */
506 file->private_data = NULL; /* prevent ll_local_open assertion */
508 fd = ll_file_data_get();
510 GOTO(out_och_free, rc = -ENOMEM);
/* For directories: claim statahead ownership if nobody holds it yet. */
513 if (S_ISDIR(inode->i_mode)) {
514 cfs_spin_lock(&lli->lli_sa_lock);
515 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
516 lli->lli_opendir_pid == 0) {
517 lli->lli_opendir_key = fd;
518 lli->lli_opendir_pid = cfs_curproc_pid();
521 cfs_spin_unlock(&lli->lli_sa_lock);
/* The root dentry needs no MDS open handle. */
524 if (inode->i_sb->s_root == file->f_dentry) {
525 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: construct one (oit) from f_flags. */
529 if (!it || !it->d.lustre.it_disposition) {
530 /* Convert f_flags into access mode. We cannot use file->f_mode,
531 * because everything but O_ACCMODE mask was stripped from
/* O_RDONLY/O_WRONLY/O_RDWR (+1) maps onto FMODE_READ|FMODE_WRITE. */
533 if ((oit.it_flags + 1) & O_ACCMODE)
535 if (file->f_flags & O_TRUNC)
536 oit.it_flags |= FMODE_WRITE;
538 /* kernel only call f_op->open in dentry_open. filp_open calls
539 * dentry_open after call to open_namei that checks permissions.
540 * Only nfsd_open call dentry_open directly without checking
541 * permissions and because of that this code below is safe. */
542 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
543 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
545 /* We do not want O_EXCL here, presumably we opened the file
546 * already? XXX - NFS implications? */
547 oit.it_flags &= ~O_EXCL;
549 /* bug20584, if "it_flags" contains O_CREAT, the file will be
550 * created if necessary, then "IT_CREAT" should be set to keep
551 * consistent with it */
552 if (oit.it_flags & O_CREAT)
553 oit.it_op |= IT_CREAT;
559 /* Let's see if we have file open on MDS already. */
560 if (it->it_flags & FMODE_WRITE) {
561 och_p = &lli->lli_mds_write_och;
562 och_usecount = &lli->lli_open_fd_write_count;
563 } else if (it->it_flags & FMODE_EXEC) {
564 och_p = &lli->lli_mds_exec_och;
565 och_usecount = &lli->lli_open_fd_exec_count;
567 och_p = &lli->lli_mds_read_och;
568 och_usecount = &lli->lli_open_fd_read_count;
571 cfs_mutex_lock(&lli->lli_och_mutex);
572 if (*och_p) { /* Open handle is present */
573 if (it_disposition(it, DISP_OPEN_OPEN)) {
574 /* Well, there's extra open request that we do not need,
575 let's close it somehow. This will decref request. */
576 rc = it_open_error(DISP_OPEN_OPEN, it);
578 cfs_mutex_unlock(&lli->lli_och_mutex);
579 GOTO(out_openerr, rc);
582 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached MDS handle: open locally only (och == NULL). */
586 rc = ll_local_open(file, it, fd, NULL);
589 cfs_mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 LASSERT(*och_usecount == 0);
594 if (!it->d.lustre.it_disposition) {
595 /* We cannot just request lock handle now, new ELC code
596 means that one of other OPEN locks for this file
597 could be cancelled, and since blocking ast handler
598 would attempt to grab och_mutex as well, that would
599 result in a deadlock */
600 cfs_mutex_unlock(&lli->lli_och_mutex);
601 it->it_create_mode |= M_CHECK_STALE;
602 rc = ll_intent_file_open(file, NULL, 0, it);
603 it->it_create_mode &= ~M_CHECK_STALE;
605 GOTO(out_openerr, rc);
609 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
611 GOTO(out_och_free, rc = -ENOMEM);
615 /* md_intent_lock() didn't get a request ref if there was an
616 * open error, so don't do cleanup on the request here
618 /* XXX (green): Should not we bail out on any error here, not
619 * just open error? */
620 rc = it_open_error(DISP_OPEN_OPEN, it);
622 GOTO(out_och_free, rc);
624 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
626 rc = ll_local_open(file, it, fd, *och_p);
628 GOTO(out_och_free, rc);
630 cfs_mutex_unlock(&lli->lli_och_mutex);
633 /* Must do this outside lli_och_mutex lock to prevent deadlock where
634 different kind of OPEN lock for this same inode gets cancelled
635 by ldlm_cancel_lru */
636 if (!S_ISREG(inode->i_mode))
637 GOTO(out_och_free, rc);
/* No stripe metadata yet: either object creation was deliberately
 * delayed (O_LOV_DELAY_CREATE / read-only open) or it proceeds here. */
641 if (!lli->lli_has_smd) {
642 if (file->f_flags & O_LOV_DELAY_CREATE ||
643 !(file->f_mode & FMODE_WRITE)) {
644 CDEBUG(D_INODE, "object creation was delayed\n");
645 GOTO(out_och_free, rc);
648 file->f_flags &= ~O_LOV_DELAY_CREATE;
649 GOTO(out_och_free, rc);
/* Common exit: drop the extra open-request reference, free a half-
 * initialised och, undo statahead ownership, and release fd on error. */
652 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
653 ptlrpc_req_finished(it->d.lustre.it_data);
654 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
658 if (och_p && *och_p) {
659 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
660 *och_p = NULL; /* OBD_FREE writes some magic there */
663 cfs_mutex_unlock(&lli->lli_och_mutex);
666 if (opendir_set != 0)
667 ll_stop_statahead(inode, lli->lli_opendir_key);
669 ll_file_data_put(fd);
671 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
677 /* Fills the obdo with the attributes for the lsm */
678 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
679 struct obd_capa *capa, struct obdo *obdo,
680 __u64 ioepoch, int sync)
682 struct ptlrpc_request_set *set;
683 struct obd_info oinfo = { { { 0 } } };
688 LASSERT(lsm != NULL);
/* Identify the object and request the full attribute set from the OSTs. */
692 oinfo.oi_oa->o_id = lsm->lsm_object_id;
693 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
694 oinfo.oi_oa->o_mode = S_IFREG;
695 oinfo.oi_oa->o_ioepoch = ioepoch;
696 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
697 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
698 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
699 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
700 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
701 OBD_MD_FLDATAVERSION;
702 oinfo.oi_capa = capa;
/* When @sync is set (presumably — the guard line is missing from this
 * extraction), request a server-side lock for the getattr. */
704 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
705 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
/* Issue the getattr asynchronously through a request set and wait. */
708 set = ptlrpc_prep_set();
710 CERROR("can't allocate ptlrpc set\n");
713 rc = obd_getattr_async(exp, &oinfo, set);
715 rc = ptlrpc_set_wait(set);
716 ptlrpc_set_destroy(set);
/* Mask the result down to the attributes the OSTs actually own. */
719 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
720 OBD_MD_FLATIME | OBD_MD_FLMTIME |
721 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
722 OBD_MD_FLDATAVERSION);
727 * Performs the getattr on the inode and updates its fields.
728 * If @sync != 0, perform the getattr under the server-side lock.
730 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
731 __u64 ioepoch, int sync)
733 struct obd_capa *capa = ll_mdscapa_get(inode);
734 struct lov_stripe_md *lsm;
/* Take an lsm reference, fetch OST attributes, refresh the inode, and
 * drop the reference; error handling between the visible lines is
 * missing from this extraction. */
738 lsm = ccc_inode_lsm_get(inode);
739 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
740 capa, obdo, ioepoch, sync);
743 obdo_refresh_inode(inode, obdo, obdo->o_valid);
745 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
746 lsm ? lsm->lsm_object_id : 0, i_size_read(inode),
747 (unsigned long long)inode->i_blocks,
748 (unsigned long)ll_inode_blksize(inode));
750 ccc_inode_lsm_put(inode, lsm);
/* Merge MDS-provided timestamps with OST-provided size/blocks/timestamps
 * into the inode, under the inode size lock. */
754 int ll_merge_lvb(struct inode *inode)
756 struct ll_inode_info *lli = ll_i2info(inode);
757 struct ll_sb_info *sbi = ll_i2sbi(inode);
758 struct lov_stripe_md *lsm;
764 lsm = ccc_inode_lsm_get(inode);
765 ll_inode_size_lock(inode);
766 inode_init_lvb(inode, &lvb);
768 /* merge timestamps the most resently obtained from mds with
769 timestamps obtained from osts */
770 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
771 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
772 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
/* obd_merge_lvb folds per-stripe OST attributes into @lvb. */
774 rc = obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
775 cl_isize_write_nolock(inode, lvb.lvb_size);
777 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
778 PFID(&lli->lli_fid), lvb.lvb_size);
779 inode->i_blocks = lvb.lvb_blocks;
781 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
782 LTIME_S(inode->i_atime) = lvb.lvb_atime;
783 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
785 ll_inode_size_unlock(inode);
786 ccc_inode_lsm_put(inode, lsm);
/* Glimpse helper for ioctls: getattr the lsm's OST objects (no capa, no
 * epoch, no server lock) and copy size/blocks/times into @st. */
791 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
794 struct obdo obdo = { 0 };
797 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
799 st->st_size = obdo.o_size;
800 st->st_blocks = obdo.o_blocks;
801 st->st_mtime = obdo.o_mtime;
802 st->st_atime = obdo.o_atime;
803 st->st_ctime = obdo.o_ctime;
/* Initialise a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: propagate O_NONBLOCK/O_APPEND/O_SYNC, pick the cl object, and
 * choose the DLM locking policy. */
808 void ll_io_init(struct cl_io *io, const struct file *file, int write)
810 struct inode *inode = file->f_dentry->d_inode;
812 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
814 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
815 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || IS_SYNC(inode);
817 io->ci_obj = ll_i2info(inode)->lli_clob;
818 io->ci_lockreq = CILR_MAYBE;
/* nolock files delegate locking to the server; O_APPEND needs a
 * mandatory lock to serialize end-of-file writes. */
819 if (ll_file_nolock(file)) {
820 io->ci_lockreq = CILR_NEVER;
821 io->ci_no_srvlock = 1;
822 } else if (file->f_flags & O_APPEND) {
823 io->ci_lockreq = CILR_MANDATORY;
/* Common engine for all read/write entry points: set up a cl_io from
 * @args, take the write mutex or trunc semaphore as appropriate, run the
 * cl_io loop, update *ppos and the per-sb byte statistics.
 * NOTE(review): switch-case labels and several cleanup lines are missing
 * from this extraction. */
827 static ssize_t ll_file_io_generic(const struct lu_env *env,
828 struct vvp_io_args *args, struct file *file,
829 enum cl_io_type iot, loff_t *ppos, size_t count)
831 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
836 io = ccc_env_thread_io(env);
837 ll_io_init(io, file, iot == CIT_WRITE);
839 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
840 struct vvp_io *vio = vvp_env_io(env);
841 struct ccc_io *cio = ccc_env_io(env);
842 int write_mutex_locked = 0;
844 cio->cui_fd = LUSTRE_FPRIVATE(file);
845 vio->cui_io_subtype = args->via_io_subtype;
/* Per-subtype setup: normal iovec I/O, sendfile, or splice. */
847 switch (vio->cui_io_subtype) {
849 cio->cui_iov = args->u.normal.via_iov;
850 cio->cui_nrsegs = args->u.normal.via_nrsegs;
851 cio->cui_tot_nrsegs = cio->cui_nrsegs;
852 #ifndef HAVE_FILE_WRITEV
853 cio->cui_iocb = args->u.normal.via_iocb;
/* Non-group-lock writes serialize on lli_write_mutex; reads only take
 * the truncate semaphore shared. */
855 if ((iot == CIT_WRITE) &&
856 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
857 if (cfs_mutex_lock_interruptible(&lli->
859 GOTO(out, result = -ERESTARTSYS);
860 write_mutex_locked = 1;
861 } else if (iot == CIT_READ) {
862 cfs_down_read(&lli->lli_trunc_sem);
866 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
867 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
870 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
871 vio->u.splice.cui_flags = args->u.splice.via_flags;
874 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
877 result = cl_io_loop(env, io);
878 if (write_mutex_locked)
879 cfs_mutex_unlock(&lli->lli_write_mutex);
880 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
881 cfs_up_read(&lli->lli_trunc_sem);
883 /* cl_io_rw_init() handled IO */
884 result = io->ci_result;
/* ci_nob > 0 means some bytes moved; advance the caller's offset. */
887 if (io->ci_nob > 0) {
889 *ppos = io->u.ci_wr.wr.crw_pos;
895 if (iot == CIT_READ) {
897 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
898 LPROC_LL_READ_BYTES, result);
899 } else if (iot == CIT_WRITE) {
901 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
902 LPROC_LL_WRITE_BYTES, result);
903 lli->lli_write_rc = 0;
905 lli->lli_write_rc = result;
914 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
916 static int ll_file_get_iov_count(const struct iovec *iov,
917 unsigned long *nr_segs, size_t *count)
/* Validate an iovec array: reject negative/overflowing lengths and
 * inaccessible user buffers, truncating *nr_segs at the first bad
 * segment (kernel-idiom copy; some lines missing from this extraction). */
922 for (seg = 0; seg < *nr_segs; seg++) {
923 const struct iovec *iv = &iov[seg];
926 * If any segment has a negative length, or the cumulative
927 * length ever wraps negative then return -EINVAL.
930 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
932 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
937 cnt -= iv->iov_len; /* This segment is no good */
944 #ifdef HAVE_FILE_READV
/* Vectored read entry point (pre-aio kernels): validate the iovec, set
 * up vvp_io_args and run the generic I/O engine for CIT_READ. */
945 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
946 unsigned long nr_segs, loff_t *ppos)
949 struct vvp_io_args *args;
955 result = ll_file_get_iov_count(iov, &nr_segs, &count);
959 env = cl_env_get(&refcheck);
961 RETURN(PTR_ERR(env));
963 args = vvp_env_args(env, IO_NORMAL);
964 args->u.normal.via_iov = (struct iovec *)iov;
965 args->u.normal.via_nrsegs = nr_segs;
967 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
968 cl_env_put(env, &refcheck);
/* Single-buffer read (readv-kernel variant): wrap @buf in a one-element
 * iovec from the thread env and delegate to ll_file_readv(). */
972 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
976 struct iovec *local_iov;
981 env = cl_env_get(&refcheck);
983 RETURN(PTR_ERR(env));
985 local_iov = &vvp_env_info(env)->vti_local_iov;
986 local_iov->iov_base = (void __user *)buf;
987 local_iov->iov_len = count;
988 result = ll_file_readv(file, local_iov, 1, ppos);
989 cl_env_put(env, &refcheck);
/* aio read entry point: validate the iovec, record the kiocb in the
 * vvp args, and run the generic I/O engine for CIT_READ using
 * iocb->ki_pos as the file offset. */
994 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
995 unsigned long nr_segs, loff_t pos)
998 struct vvp_io_args *args;
1004 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1008 env = cl_env_get(&refcheck);
1010 RETURN(PTR_ERR(env));
1012 args = vvp_env_args(env, IO_NORMAL);
1013 args->u.normal.via_iov = (struct iovec *)iov;
1014 args->u.normal.via_nrsegs = nr_segs;
1015 args->u.normal.via_iocb = iocb;
1017 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1018 &iocb->ki_pos, count);
1019 cl_env_put(env, &refcheck);
/* Single-buffer read (aio-kernel variant): build a synchronous kiocb and
 * one-element iovec from the thread env, call ll_file_aio_read(), and
 * write the advanced position back to *ppos. */
1023 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1027 struct iovec *local_iov;
1028 struct kiocb *kiocb;
1033 env = cl_env_get(&refcheck);
1035 RETURN(PTR_ERR(env));
1037 local_iov = &vvp_env_info(env)->vti_local_iov;
1038 kiocb = &vvp_env_info(env)->vti_kiocb;
1039 local_iov->iov_base = (void __user *)buf;
1040 local_iov->iov_len = count;
1041 init_sync_kiocb(kiocb, file);
1042 kiocb->ki_pos = *ppos;
1043 kiocb->ki_left = count;
1045 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1046 *ppos = kiocb->ki_pos;
1048 cl_env_put(env, &refcheck);
1054 * Write to a file (through the page cache).
1056 #ifdef HAVE_FILE_WRITEV
/* Vectored write entry point (pre-aio kernels): validate the iovec, set
 * up vvp_io_args and run the generic I/O engine for CIT_WRITE. */
1057 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1058 unsigned long nr_segs, loff_t *ppos)
1061 struct vvp_io_args *args;
1067 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1071 env = cl_env_get(&refcheck);
1073 RETURN(PTR_ERR(env));
1075 args = vvp_env_args(env, IO_NORMAL);
1076 args->u.normal.via_iov = (struct iovec *)iov;
1077 args->u.normal.via_nrsegs = nr_segs;
1079 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1080 cl_env_put(env, &refcheck);
/* Single-buffer write (writev-kernel variant): wrap @buf in a
 * one-element iovec from the thread env and delegate to ll_file_writev(). */
1084 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1088 struct iovec *local_iov;
1093 env = cl_env_get(&refcheck);
1095 RETURN(PTR_ERR(env));
1097 local_iov = &vvp_env_info(env)->vti_local_iov;
1098 local_iov->iov_base = (void __user *)buf;
1099 local_iov->iov_len = count;
1101 result = ll_file_writev(file, local_iov, 1, ppos);
1102 cl_env_put(env, &refcheck);
1106 #else /* AIO stuff */
/* aio write entry point: validate the iovec, record the kiocb in the
 * vvp args, and run the generic I/O engine for CIT_WRITE using
 * iocb->ki_pos as the file offset. */
1107 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1108 unsigned long nr_segs, loff_t pos)
1111 struct vvp_io_args *args;
1117 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1121 env = cl_env_get(&refcheck);
1123 RETURN(PTR_ERR(env));
1125 args = vvp_env_args(env, IO_NORMAL);
1126 args->u.normal.via_iov = (struct iovec *)iov;
1127 args->u.normal.via_nrsegs = nr_segs;
1128 args->u.normal.via_iocb = iocb;
1130 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1131 &iocb->ki_pos, count);
1132 cl_env_put(env, &refcheck);
/* Single-buffer write (aio-kernel variant): build a synchronous kiocb
 * and one-element iovec from the thread env, call ll_file_aio_write(),
 * and write the advanced position back to *ppos. */
1136 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1140 struct iovec *local_iov;
1141 struct kiocb *kiocb;
1146 env = cl_env_get(&refcheck);
1148 RETURN(PTR_ERR(env));
1150 local_iov = &vvp_env_info(env)->vti_local_iov;
1151 kiocb = &vvp_env_info(env)->vti_kiocb;
1152 local_iov->iov_base = (void __user *)buf;
1153 local_iov->iov_len = count;
1154 init_sync_kiocb(kiocb, file);
1155 kiocb->ki_pos = *ppos;
1156 kiocb->ki_left = count;
1158 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1159 *ppos = kiocb->ki_pos;
1161 cl_env_put(env, &refcheck);
1167 #ifdef HAVE_KERNEL_SENDFILE
1169 * Send file content (through pagecache) somewhere with helper
/* sendfile entry point: pass the read actor and target through IO_SENDFILE
 * vvp args to the generic CIT_READ engine. */
1171 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1172 read_actor_t actor, void *target)
1175 struct vvp_io_args *args;
1180 env = cl_env_get(&refcheck);
1182 RETURN(PTR_ERR(env));
1184 args = vvp_env_args(env, IO_SENDFILE);
1185 args->u.sendfile.via_target = target;
1186 args->u.sendfile.via_actor = actor;
1188 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1189 cl_env_put(env, &refcheck);
1194 #ifdef HAVE_KERNEL_SPLICE_READ
1196 * Send file content (through pagecache) somewhere with helper
/* splice_read entry point: pass the pipe and splice flags through
 * IO_SPLICE vvp args to the generic CIT_READ engine. */
1198 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1199 struct pipe_inode_info *pipe, size_t count,
1203 struct vvp_io_args *args;
1208 env = cl_env_get(&refcheck);
1210 RETURN(PTR_ERR(env));
1212 args = vvp_env_args(env, IO_SPLICE);
1213 args->u.splice.via_pipe = pipe;
1214 args->u.splice.via_flags = flags;
1216 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1217 cl_env_put(env, &refcheck);
/* Recreate a lost OST object for this inode: clone the stripe metadata,
 * fill an obdo with id/flags/parent fid, and call obd_create() with
 * OBD_FL_RECREATE_OBJS under the inode size lock.
 * NOTE(review): oa allocation and several error paths are on lines
 * missing from this extraction. */
1222 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1225 struct obd_export *exp = ll_i2dtexp(inode);
1226 struct obd_trans_info oti = { 0 };
1227 struct obdo *oa = NULL;
1230 struct lov_stripe_md *lsm = NULL, *lsm2;
1237 lsm = ccc_inode_lsm_get(inode);
1239 GOTO(out, rc = -ENOENT);
1241 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1242 (lsm->lsm_stripe_count));
1244 OBD_ALLOC_LARGE(lsm2, lsm_size);
1246 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for object recreation. */
1250 oa->o_nlink = ost_idx;
1251 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1252 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1253 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1254 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1255 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1256 memcpy(lsm2, lsm, lsm_size);
1257 ll_inode_size_lock(inode);
1258 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1259 ll_inode_size_unlock(inode);
1261 OBD_FREE_LARGE(lsm2, lsm_size);
1264 ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_RECREATE_OBJ handler: admin-only; copy the user's
 * ll_recreate_obj descriptor and recreate the object by id/OST index. */
1269 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1271 struct ll_recreate_obj ucreat;
1274 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1277 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1278 sizeof(struct ll_recreate_obj)))
1281 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1282 ucreat.lrc_ost_idx));
/* LL_IOC_RECREATE_FID handler: admin-only; copy a lu_fid from userspace
 * and decode it into an object id and OST index for recreation.
 * NOTE(review): the id/ost_idx packing below encodes the OST index and
 * low seq bits inside the fid fields — confirm against the fid layout. */
1285 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1292 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1295 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1296 sizeof(struct lu_fid)))
1299 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1300 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1301 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/* Set striping info via an open intent carrying @lum: refuse if stripe
 * metadata already exists, otherwise send the open under the inode size
 * lock and release the resulting open handle and intent.
 * NOTE(review): the out/out_req_free label placement is on lines missing
 * from this extraction. */
1304 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1305 int flags, struct lov_user_md *lum, int lum_size)
1307 struct lov_stripe_md *lsm = NULL;
1308 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* Striping is write-once: an existing lsm means EALREADY-style refusal. */
1312 lsm = ccc_inode_lsm_get(inode);
1314 ccc_inode_lsm_put(inode, lsm);
1315 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1320 ll_inode_size_lock(inode);
1321 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1324 rc = oit.d.lustre.it_status;
1326 GOTO(out_req_free, rc);
1328 ll_release_openhandle(file->f_dentry, &oit);
1331 ll_inode_size_unlock(inode);
1332 ll_intent_release(&oit);
1333 ccc_inode_lsm_put(inode, lsm);
1336 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA (striping metadata) for @filename relative to @inode
 * via md_getattr_name(). On success *lmmp points into the reply buffer
 * of *request — the caller owns the request and must finish it.
 * The EA is byte-swapped to host endianness for userspace consumption.
 * NOTE(review): lines between the visible ones (ENTRY/RETURN, some error
 * branches, the out: label) were elided by extraction. */
1340 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1341                              struct lov_mds_md **lmmp, int *lmm_size,
1342                              struct ptlrpc_request **request)
1344         struct ll_sb_info *sbi = ll_i2sbi(inode);
1345         struct mdt_body *body;
1346         struct lov_mds_md *lmm = NULL;
1347         struct ptlrpc_request *req = NULL;
1348         struct md_op_data *op_data;
 /* Size the getattr reply buffer for the largest possible MDS EA. */
1351         rc = ll_get_max_mdsize(sbi, &lmmsize);
1355         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1356                                      strlen(filename), lmmsize,
1357                                      LUSTRE_OPC_ANY, NULL);
1358         if (IS_ERR(op_data))
1359                 RETURN(PTR_ERR(op_data));
1361         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1362         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1363         ll_finish_md_op_data(op_data);
1365                 CDEBUG(D_INFO, "md_getattr_name failed "
1366                        "on %s: rc %d\n", filename, rc);
1370         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1371         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1373         lmmsize = body->eadatasize;
 /* No EA bits valid (or zero size — elided check) means no striping data. */
1375         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1377                 GOTO(out, rc = -ENODATA);
1380         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1381         LASSERT(lmm != NULL);
1383         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1384             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1385                 GOTO(out, rc = -EPROTO);
1389          * This is coming from the MDS, so is probably in
1390          * little endian.  We convert it to host endian before
1391          * passing it to userspace.
 /* Only swab on big-endian hosts: LOV_MAGIC equals its LE form on LE. */
1393         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1394                 /* if function called for directory - we should
1395                  * avoid swab not existent lsm objects */
1396                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1397                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1398                         if (S_ISREG(body->mode))
1399                                 lustre_swab_lov_user_md_objects(
1400                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1401                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1402                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1403                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1404                         if (S_ISREG(body->mode))
1405                                 lustre_swab_lov_user_md_objects(
1406                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1407                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1413         *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a raw lov_user_md (with one trailing
 * ost_data entry) from userspace and apply it with MDS_OPEN_HAS_OBJS,
 * i.e. the objects already exist. Requires CAP_SYS_ADMIN. */
1418 static int ll_lov_setea(struct inode *inode, struct file *file,
1421         int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1422         struct lov_user_md      *lump;
1423         int                      lum_size = sizeof(struct lov_user_md) +
1424                                             sizeof(struct lov_user_ost_data);
1428         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1431         OBD_ALLOC_LARGE(lump, lum_size);
1435         if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1436                 OBD_FREE_LARGE(lump, lum_size);
1440         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1442         OBD_FREE_LARGE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: read the user's lov_user_md (v1 first,
 * upgrading to v3 if the magic says so), set the layout, then echo the
 * resulting striping back to userspace via LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): the success-branch conditional around the GETSTRIPE
 * echo-back was elided by extraction. */
1446 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1449         struct lov_user_md_v3    lumv3;
1450         struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1451         struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1452         struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1455         int                      flags = FMODE_WRITE;
1458         /* first try with v1 which is smaller than v3 */
1459         lum_size = sizeof(struct lov_user_md_v1);
1460         if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
 /* v3 magic: re-copy the full v3 structure over the partial v1 read. */
1463         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1464                 lum_size = sizeof(struct lov_user_md_v3);
1465                 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1469         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1471                 struct lov_stripe_md *lsm;
 /* Clear the user's stripe count first so a failed echo-back reads as 0. */
1472                 put_user(0, &lumv1p->lmm_stripe_count);
1473                 lsm = ccc_inode_lsm_get(inode);
1474                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1475                                    0, lsm, (void *)arg);
1476                 ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_LOV_GETSTRIPE handler: hand the inode's lov_stripe_md to the
 * data export's iocontrol, which copies the layout out to @arg. */
1481 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1483         struct lov_stripe_md *lsm;
1487         lsm = ccc_inode_lsm_get(inode);
1489                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1491         ccc_inode_lsm_put(inode, lsm);
/* LL_IOC_GROUP_LOCK handler: take a group (GID-keyed) extent lock on the
 * whole file and record it in the per-fd state. The flag check is done
 * under lli_lock twice — once before the (blocking) cl_get_grouplock()
 * call and once after — to lose gracefully if another thread raced us. */
1495 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1497         struct ll_inode_info   *lli = ll_i2info(inode);
1498         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1499         struct ccc_grouplock    grouplock;
 /* Group locks make no sense on a nolock (-o nolock / liblustre) mount. */
1503         if (ll_file_nolock(file))
1504                 RETURN(-EOPNOTSUPP);
1506         cfs_spin_lock(&lli->lli_lock);
1507         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1508                 CWARN("group lock already existed with gid %lu\n",
1509                       fd->fd_grouplock.cg_gid);
1510                 cfs_spin_unlock(&lli->lli_lock);
1513         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1514         cfs_spin_unlock(&lli->lli_lock);
 /* May block unless the file was opened O_NONBLOCK. */
1516         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1517                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1521         cfs_spin_lock(&lli->lli_lock);
1522         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1523                 cfs_spin_unlock(&lli->lli_lock);
1524                 CERROR("another thread just won the race\n");
1525                 cl_put_grouplock(&grouplock);
1529         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1530         fd->fd_grouplock = grouplock;
1531         cfs_spin_unlock(&lli->lli_lock);
1533         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* LL_IOC_GROUP_UNLOCK handler: drop the group lock recorded on this fd,
 * but only if @arg matches the GID it was taken with. The fd state is
 * cleared under lli_lock; the actual lock release happens outside it. */
1537 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1539         struct ll_inode_info   *lli = ll_i2info(inode);
1540         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1541         struct ccc_grouplock    grouplock;
1544         cfs_spin_lock(&lli->lli_lock);
1545         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1546                 cfs_spin_unlock(&lli->lli_lock);
1547                 CWARN("no group lock held\n");
1550         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1552         if (fd->fd_grouplock.cg_gid != arg) {
1553                 CWARN("group lock %lu doesn't match current id %lu\n",
1554                        arg, fd->fd_grouplock.cg_gid);
1555                 cfs_spin_unlock(&lli->lli_lock);
 /* Copy out and clear under the spinlock; release after dropping it. */
1559         grouplock = fd->fd_grouplock;
1560         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1561         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1562         cfs_spin_unlock(&lli->lli_lock);
1564         cl_put_grouplock(&grouplock);
1565         CDEBUG(D_INFO, "group lock %lu released\n", arg);
 * Close inode open handle
 *
 * \param dentry [in]     dentry which contains the inode
 * \param it     [in,out] intent which contains open info and result
 *
 * \retval <0 failure
1578 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1580         struct inode *inode = dentry->d_inode;
1581         struct obd_client_handle *och;
1587         /* Root ? Do nothing. */
1588         if (dentry->d_inode->i_sb->s_root == dentry)
1591         /* No open handle to close? Move away */
1592         if (!it_disposition(it, DISP_OPEN_OPEN))
1595         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1597         OBD_ALLOC(och, sizeof(*och));
1599                 GOTO(out, rc = -ENOMEM);
 /* Fill a client handle from the intent, then send the MDS close. */
1601         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1602                     ll_i2info(inode), it, och);
1604         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1607         /* this one is in place of ll_file_open */
1608         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1609                 ptlrpc_req_finished(it->d.lustre.it_data);
1610                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
1619 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1622         struct obd_export *exp = ll_i2dtexp(inode);
1623         struct lov_stripe_md *lsm = NULL;
1624         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1625         int vallen = num_bytes;
1629         /* Checks for fiemap flags */
1630         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
 /* Report back which flags we don't support (error return elided). */
1631                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1635         /* Check for FIEMAP_FLAG_SYNC */
1636         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1637                 rc = filemap_fdatawrite(inode->i_mapping);
1642         lsm = ccc_inode_lsm_get(inode);
1646         /* If the stripe_count > 1 and the application does not understand
1647          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1649         if (lsm->lsm_stripe_count > 1 &&
1650             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1651                 GOTO(out, rc = -EOPNOTSUPP);
1653         fm_key.oa.o_id = lsm->lsm_object_id;
1654         fm_key.oa.o_seq = lsm->lsm_object_seq;
1655         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1657         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1658         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1659         /* If filesize is 0, then there would be no objects for mapping */
1660         if (fm_key.oa.o_size == 0) {
1661                 fiemap->fm_mapped_extents = 0;
1665         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
 /* Delegate the actual extent mapping to the OSC/LOV via obd_get_info. */
1667         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1670                 CERROR("obd_get_info failed: rc = %d\n", rc);
1673         ccc_inode_lsm_put(inode, lsm);
/* OBD_IOC_FID2PATH handler: read the user's getinfo_fid2path header to
 * learn gf_pathlen, allocate an output buffer large enough for the path,
 * resolve via the MDC iocontrol, and copy the result back.
 * NOTE(review): error returns and the gfin free were elided by
 * extraction; the visible cleanup only frees gfout. */
1677 int ll_fid2path(struct obd_export *exp, void *arg)
1679         struct getinfo_fid2path *gfout, *gfin;
1683         /* Need to get the buflen */
1684         OBD_ALLOC_PTR(gfin);
1687         if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1692         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1693         OBD_ALLOC(gfout, outsize);
1694         if (gfout == NULL) {
1698         memcpy(gfout, gfin, sizeof(*gfout));
1701         /* Call mdc_iocontrol */
1702         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1705         if (cfs_copy_to_user(arg, gfout, outsize))
1709         OBD_FREE(gfout, outsize);
/* FSFILT_IOC_FIEMAP handler: size a kernel fiemap buffer from the user's
 * fm_extent_count, copy the request (and first extent, used to continue a
 * previous mapping) in, run ll_do_fiemap(), and copy the header plus the
 * mapped extents back out.
 * NOTE(review): overflow of extent_count * sizeof(extent) is not visibly
 * checked before the allocation — worth confirming upstream. */
1713 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1715         struct ll_user_fiemap *fiemap_s;
1716         size_t num_bytes, ret_bytes;
1717         unsigned int extent_count;
1720         /* Get the extent count so we can calculate the size of
1721          * required fiemap buffer */
1722         if (get_user(extent_count,
1723             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1725         num_bytes = sizeof(*fiemap_s) + (extent_count *
1726                                          sizeof(struct ll_fiemap_extent));
1728         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1729         if (fiemap_s == NULL)
1732         /* get the fiemap value */
1733         if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1735                 GOTO(error, rc = -EFAULT);
1737         /* If fm_extent_count is non-zero, read the first extent since
1738          * it is used to calculate end_offset and device from previous
1741         if (copy_from_user(&fiemap_s->fm_extents[0],
1742             (char __user *)arg + sizeof(*fiemap_s),
1743             sizeof(struct ll_fiemap_extent)))
1744                 GOTO(error, rc = -EFAULT);
1747         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
 /* Copy back the header plus however many extents were actually mapped. */
1751         ret_bytes = sizeof(struct ll_user_fiemap);
1753         if (extent_count != 0)
1754                 ret_bytes += (fiemap_s->fm_mapped_extents *
1755                                  sizeof(struct ll_fiemap_extent));
1757         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1761         OBD_FREE_LARGE(fiemap_s, num_bytes);
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * @param extent_lock  Take extent lock. Not needed if a process is already
 *                     holding the OST object group locks.
1774 static int ll_data_version(struct inode *inode, __u64 *data_version,
1777         struct lov_stripe_md    *lsm = NULL;
1778         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1779         struct obdo             *obdo = NULL;
1783         /* If no stripe, we consider version is 0. */
1784         lsm = ccc_inode_lsm_get(inode);
1787                 CDEBUG(D_INODE, "No object for inode\n");
1791         OBD_ALLOC_PTR(obdo);
1793                 ccc_inode_lsm_put(inode, lsm);
 /* Ask the OSTs for the object version via getattr; error path elided. */
1797         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1799                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1802                         *data_version = obdo->o_data_version;
1806         ccc_inode_lsm_put(inode, lsm);
/* unlocked_ioctl entry point for regular files: dispatch the Lustre
 * ioctl set (striping, group locks, fiemap, fid2path, data version, …)
 * and fall through to ll_iocontrol_call / obd_iocontrol for anything
 * not handled here.
 * NOTE(review): many RETURN/break lines and some case labels were elided
 * by extraction; comments describe only the visible dispatch arms. */
1811 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1813         struct inode            *inode = file->f_dentry->d_inode;
1814         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
1819         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1820                inode->i_generation, inode, cmd);
1821         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1823         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1824         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1828         case LL_IOC_GETFLAGS:
1829                 /* Get the current value of the file flags */
1830                 return put_user(fd->fd_flags, (int *)arg);
1831         case LL_IOC_SETFLAGS:
1832         case LL_IOC_CLRFLAGS:
1833                 /* Set or clear specific file flags */
1834                 /* XXX This probably needs checks to ensure the flags are
1835                  *     not abused, and to handle any flag side effects.
1837                 if (get_user(flags, (int *) arg))
1840                 if (cmd == LL_IOC_SETFLAGS) {
 /* LL_FILE_IGNORE_LOCK is only safe with O_DIRECT (no cached pages). */
1841                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1842                             !(file->f_flags & O_DIRECT)) {
1843                                 CERROR("%s: unable to disable locking on "
1844                                        "non-O_DIRECT file\n", current->comm);
1848                         fd->fd_flags |= flags;
1850                         fd->fd_flags &= ~flags;
1853         case LL_IOC_LOV_SETSTRIPE:
1854                 RETURN(ll_lov_setstripe(inode, file, arg));
1855         case LL_IOC_LOV_SETEA:
1856                 RETURN(ll_lov_setea(inode, file, arg));
1857         case LL_IOC_LOV_GETSTRIPE:
1858                 RETURN(ll_lov_getstripe(inode, arg));
1859         case LL_IOC_RECREATE_OBJ:
1860                 RETURN(ll_lov_recreate_obj(inode, arg));
1861         case LL_IOC_RECREATE_FID:
1862                 RETURN(ll_lov_recreate_fid(inode, arg));
1863         case FSFILT_IOC_FIEMAP:
1864                 RETURN(ll_ioctl_fiemap(inode, arg));
1865         case FSFILT_IOC_GETFLAGS:
1866         case FSFILT_IOC_SETFLAGS:
1867                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1868         case FSFILT_IOC_GETVERSION_OLD:
1869         case FSFILT_IOC_GETVERSION:
1870                 RETURN(put_user(inode->i_generation, (int *)arg));
1871         case LL_IOC_GROUP_LOCK:
1872                 RETURN(ll_get_grouplock(inode, file, arg));
1873         case LL_IOC_GROUP_UNLOCK:
1874                 RETURN(ll_put_grouplock(inode, file, arg));
1875         case IOC_OBD_STATFS:
1876                 RETURN(ll_obd_statfs(inode, (void *)arg));
1878         /* We need to special case any other ioctls we want to handle,
1879          * to send them to the MDS/OST as appropriate and to properly
1880          * network encode the arg field.
1881         case FSFILT_IOC_SETVERSION_OLD:
1882         case FSFILT_IOC_SETVERSION:
1884         case LL_IOC_FLUSHCTX:
1885                 RETURN(ll_flush_ctx(inode));
1886         case LL_IOC_PATH2FID: {
1887                 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1888                                      sizeof(struct lu_fid)))
1893         case OBD_IOC_FID2PATH:
1894                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1895         case LL_IOC_DATA_VERSION: {
1896                 struct ioc_data_version idv;
1899                 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
1902                 rc = ll_data_version(inode, &idv.idv_version,
1903                                 !(idv.idv_flags & LL_DV_NOFLUSH));
1906                     cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1912         case LL_IOC_GET_MDTIDX: {
1915                 mdtidx = ll_get_mdt_idx(inode);
1919                 if (put_user((int)mdtidx, (int*)arg))
1924         case OBD_IOC_GETDTNAME:
1925         case OBD_IOC_GETMDNAME:
1926                 RETURN(ll_get_obd_name(inode, cmd, arg));
 /* Unknown command: try dynamically registered handlers, then the OSC. */
1931                         ll_iocontrol_call(inode, file, cmd, arg, &err))
1934                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek: for SEEK_END, glimpse the file size from the OSTs first so
 * i_size is current; bounds the result to [0, ll_file_maxbytes].
 * NOTE(review): the error/return lines after the visible ones were
 * elided by extraction. */
1940 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1942         struct inode *inode = file->f_dentry->d_inode;
 /* retval is only computed here for the trace message below. */
1945         retval = offset + ((origin == 2) ? i_size_read(inode) :
1946                            (origin == 1) ? file->f_pos : 0);
1947         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1948                inode->i_ino, inode->i_generation, inode, retval, retval,
1949                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1950         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1952         if (origin == 2) { /* SEEK_END */
1955                 rc = ll_glimpse_size(inode);
1959                 offset += i_size_read(inode);
1960         } else if (origin == 1) { /* SEEK_CUR */
1961                 offset += file->f_pos;
1965         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1966                 if (offset != file->f_pos) {
1967                         file->f_pos = offset;
/* flush (called on close of each fd): report any write error already
 * recorded on the inode, and fold in async writeback errors collected
 * from the cl_object layer. Returns -EIO on any recorded failure. */
1975 int ll_flush(struct file *file, fl_owner_t id)
1977         struct inode *inode = file->f_dentry->d_inode;
1978         struct ll_inode_info *lli = ll_i2info(inode);
1981         LASSERT(!S_ISDIR(inode->i_mode));
1983         /* the application should know write failure already. */
1984         if (lli->lli_write_rc)
1987         /* catch async errors that were recorded back when async writeback
1988          * failed for pages in this mapping. */
1989         rc = lli->lli_async_rc;
1990         lli->lli_async_rc = 0;
1991         err = lov_read_and_clear_async_rc(lli->lli_clob);
1995         return rc ? -EIO : 0;
 * Called to make sure a portion of file has been written out.
 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
 *
 * Return how many pages have been written.
2004 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2005                        enum cl_fsync_mode mode)
2007         struct cl_env_nest nest;
2010         struct obd_capa *capa = NULL;
2011         struct cl_fsync_io *fio;
2015         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2016             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2019         env = cl_env_nested_get(&nest);
2021                 RETURN(PTR_ERR(env));
2023         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2025         io = ccc_env_thread_io(env);
2026         io->ci_obj = cl_i2info(inode)->lli_clob;
 /* fsync must proceed even during a layout change. */
2027         io->ci_ignore_layout = 1;
2029         /* initialize parameters for sync */
2030         fio = &io->u.ci_fsync;
2031         fio->fi_capa = capa;
2032         fio->fi_start = start;
2034         fio->fi_fid = ll_inode2fid(inode);
2035         fio->fi_mode = mode;
2036         fio->fi_nr_written = 0;
2038         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2039                 result = cl_io_loop(env, io);
2041                 result = io->ci_result;
 /* On success the return value is the page count written (elided if). */
2043                 result = fio->fi_nr_written;
2044         cl_io_fini(env, io);
2045         cl_env_nested_put(&nest, env);
/* fsync: wait for in-flight page I/O, collect recorded async writeback
 * errors, sync metadata via md_sync() to the MDS, then (for striped
 * files) force data out with cl_sync_file_range(CL_FSYNC_ALL).
 * Three signatures cover the kernel API variants. */
2052 #ifdef HAVE_FILE_FSYNC_4ARGS
2053 int ll_fsync(struct file *file, loff_t start, loff_t end, int data)
2054 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2055 int ll_fsync(struct file *file, int data)
2057 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2060         struct inode *inode = file->f_dentry->d_inode;
2061         struct ll_inode_info *lli = ll_i2info(inode);
2062         struct ptlrpc_request *req;
2063         struct obd_capa *oc;
2064         struct lov_stripe_md *lsm;
2067         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2068                inode->i_generation, inode);
2069         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2071         /* fsync's caller has already called _fdata{sync,write}, we want
2072          * that IO to finish before calling the osc and mdc sync methods */
2073         rc = filemap_fdatawait(inode->i_mapping);
2075         /* catch async errors that were recorded back when async writeback
2076          * failed for pages in this mapping. */
2077         if (!S_ISDIR(inode->i_mode)) {
2078                 err = lli->lli_async_rc;
2079                 lli->lli_async_rc = 0;
2082                 err = lov_read_and_clear_async_rc(lli->lli_clob);
 /* Metadata sync to the MDS, authorized by the inode's MDS capability. */
2087         oc = ll_mdscapa_get(inode);
2088         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2094                 ptlrpc_req_finished(req);
2096         lsm = ccc_inode_lsm_get(inode);
2098                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2100                 if (rc == 0 && err < 0)
2102                 lli->lli_write_rc = rc < 0 ? rc : 0;
2104         ccc_inode_lsm_put(inode, lsm);
/* flock/posix-lock entry point: translate the VFS file_lock into an
 * LDLM_FLOCK enqueue against the MDS (mode PR/PW/NL for read/write/
 * unlock), then mirror the result into the local VFS lock state.
 * NOTE(review): several case labels, break statements and the final
 * RETURN were elided by extraction. */
2109 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2111         struct inode *inode = file->f_dentry->d_inode;
2112         struct ll_sb_info *sbi = ll_i2sbi(inode);
2113         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2114                                            .ei_cb_cp =ldlm_flock_completion_ast,
2115                                            .ei_cbdata = file_lock };
2116         struct md_op_data *op_data;
2117         struct lustre_handle lockh = {0};
2118         ldlm_policy_data_t flock = {{0}};
2123         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2124                inode->i_ino, file_lock);
2126         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2128         if (file_lock->fl_flags & FL_FLOCK) {
2129                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2130                 /* flocks are whole-file locks */
2131                 flock.l_flock.end = OFFSET_MAX;
2132                 /* For flocks owner is determined by the local file desctiptor*/
2133                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2134         } else if (file_lock->fl_flags & FL_POSIX) {
2135                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2136                 flock.l_flock.start = file_lock->fl_start;
2137                 flock.l_flock.end = file_lock->fl_end;
2141         flock.l_flock.pid = file_lock->fl_pid;
2143         /* Somewhat ugly workaround for svc lockd.
2144          * lockd installs custom fl_lmops->fl_compare_owner that checks
2145          * for the fl_owner to be the same (which it always is on local node
2146          * I guess between lockd processes) and then compares pid.
2147          * As such we assign pid to the owner field to make it all work,
2148          * conflict with normal locks is unlikely since pid space and
2149          * pointer space for current->files are not intersecting */
2150         if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2151                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2153         switch (file_lock->fl_type) {
2155                 einfo.ei_mode = LCK_PR;
2158                 /* An unlock request may or may not have any relation to
2159                  * existing locks so we may not be able to pass a lock handle
2160                  * via a normal ldlm_lock_cancel() request. The request may even
2161                  * unlock a byte range in the middle of an existing lock. In
2162                  * order to process an unlock request we need all of the same
2163                  * information that is given with a normal read or write record
2164                  * lock request. To avoid creating another ldlm unlock (cancel)
2165                  * message we'll treat a LCK_NL flock request as an unlock. */
2166                 einfo.ei_mode = LCK_NL;
2169                 einfo.ei_mode = LCK_PW;
2172                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2173                        file_lock->fl_type);
 /* Non-blocking set requests map to LDLM_FL_BLOCK_NOWAIT. */
2188                 flags = LDLM_FL_BLOCK_NOWAIT;
2194                 flags = LDLM_FL_TEST_LOCK;
2195                 /* Save the old mode so that if the mode in the lock changes we
2196                  * can decrement the appropriate reader or writer refcount. */
2197                 file_lock->fl_type = einfo.ei_mode;
2200                 CERROR("unknown fcntl lock command: %d\n", cmd);
2204         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2205                                      LUSTRE_OPC_ANY, NULL);
2206         if (IS_ERR(op_data))
2207                 RETURN(PTR_ERR(op_data));
2209         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2210                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2211                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2213         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2214                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2216         ll_finish_md_op_data(op_data);
 /* Mirror the granted/released lock into the kernel's local lock lists. */
2218         if ((file_lock->fl_flags & FL_FLOCK) &&
2219             (rc == 0 || file_lock->fl_type == F_UNLCK))
2220                 flock_lock_file_wait(file, file_lock);
2221         if ((file_lock->fl_flags & FL_POSIX) &&
2222             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2223             !(flags & LDLM_FL_TEST_LOCK))
2224                 posix_lock_file_wait(file, file_lock);
/* Stub used by the -o noflock file_operations tables; the body (which
 * presumably returns -ENOSYS per the table comment below) was elided. */
2229 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 *
 * \param bits        [IN] searched lock bits [IN]
 * \param l_req_mode  [IN] searched lock mode
 * \retval boolean, true iff all bits are found
2246 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2248         struct lustre_handle lockh;
2249         ldlm_policy_data_t policy;
2250         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2251                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2260         fid = &ll_i2info(inode)->lli_fid;
2261         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2262                ldlm_lockname[mode]);
 /* TEST_LOCK: match without taking a reference on the found lock. */
2264         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2265         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2266                 policy.l_inodebits.bits = *bits & (1 << i);
2267                 if (policy.l_inodebits.bits == 0)
2270                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2271                                   &policy, mode, &lockh)) {
2272                         struct ldlm_lock *lock;
2274                         lock = ldlm_handle2lock(&lockh);
 /* Clear every bit covered by the matched lock, not just the probe bit. */
2277                                         ~(lock->l_policy_data.l_inodebits.bits);
2278                                 LDLM_LOCK_PUT(lock);
2280                                 *bits &= ~policy.l_inodebits.bits;
/* Match (and hold — no TEST_LOCK flag) an MDS inodebits lock covering
 * @bits in any of CR/CW/PR/PW; returns the matched mode, with the lock
 * handle in *lockh for the caller to release. */
2287 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2288                             struct lustre_handle *lockh)
2290         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2296         fid = &ll_i2info(inode)->lli_fid;
2297         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2299         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2300         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2301                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Common epilogue for revalidation: -ENOENT means the file was unlinked
 * behind our back, which is tolerated (treated as success after nlink
 * update); any other error is logged. */
2305 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2306         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2307                               * and return success */
2309                 /* This path cannot be hit for regular files unless in
2310                  * case of obscure races, so no need to to validate
2312                 if (!S_ISREG(inode->i_mode) &&
2313                     !S_ISDIR(inode->i_mode))
2318                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate the MDS attributes of @dentry's inode, guarded by @ibits.
 * Two strategies: with OBD_CONNECT_ATTRFID, issue an intent getattr by
 * FID (no name) and finish via ll_revalidate_it_finish(); otherwise,
 * only if we do not already hold a matching ibits lock, do a plain
 * md_getattr and refresh the inode with ll_prep_inode().
 * NOTE(review): several lines (it-handling, early RETURNs, GOTO targets)
 * were elided by extraction. */
2326 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2329         struct inode *inode = dentry->d_inode;
2330         struct ptlrpc_request *req = NULL;
2331         struct obd_export *exp;
2336                 CERROR("REPORT THIS LINE TO PETER\n");
2340         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2341                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2343         exp = ll_i2mdexp(inode);
2345         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2346          *      But under CMD case, it caused some lock issues, should be fixed
2347          *      with new CMD ibits lock. See bug 12718 */
2348         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2349                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2350                 struct md_op_data *op_data;
2352                 if (ibits == MDS_INODELOCK_LOOKUP)
2353                         oit.it_op = IT_LOOKUP;
2355                 /* Call getattr by fid, so do not provide name at all. */
2356                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2357                                              dentry->d_inode, NULL, 0, 0,
2358                                              LUSTRE_OPC_ANY, NULL);
2359                 if (IS_ERR(op_data))
2360                         RETURN(PTR_ERR(op_data));
2362                 oit.it_create_mode |= M_CHECK_STALE;
2363                 rc = md_intent_lock(exp, op_data, NULL, 0,
2364                                     /* we are not interested in name
2367                                     ll_md_blocking_ast, 0);
2368                 ll_finish_md_op_data(op_data);
2369                 oit.it_create_mode &= ~M_CHECK_STALE;
2371                         rc = ll_inode_revalidate_fini(inode, rc);
2375                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2377                         ll_intent_release(&oit);
2381                 /* Unlinked? Unhash dentry, so it is not picked up later by
2382                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2383                    here to preserve get_cwd functionality on 2.6.
2385                 if (!dentry->d_inode->i_nlink)
2386                         d_lustre_invalidate(dentry);
2388                 ll_lookup_finish_locks(&oit, dentry);
2389         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2390                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2391                 obd_valid valid = OBD_MD_FLGETATTR;
2392                 struct md_op_data *op_data;
 /* Regular files also need the LOV EA to have a usable layout. */
2395                 if (S_ISREG(inode->i_mode)) {
2396                         rc = ll_get_max_mdsize(sbi, &ealen);
2399                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2402                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2403                                              0, ealen, LUSTRE_OPC_ANY,
2405                 if (IS_ERR(op_data))
2406                         RETURN(PTR_ERR(op_data));
2408                 op_data->op_valid = valid;
2409                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2410                  * capa for this inode. Because we only keep capas of dirs
2412                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2413                 ll_finish_md_op_data(op_data);
2415                         rc = ll_inode_revalidate_fini(inode, rc);
2419                 rc = ll_prep_inode(&inode, req, NULL);
2422         ptlrpc_req_finished(req);
/* Revalidate MDS attributes and then the size: copy timestamps from the
 * cached lvb when no objects are allocated yet, otherwise glimpse the
 * size from the OSTs. */
2426 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2429         struct inode *inode = dentry->d_inode;
2433         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2435         /* if object not yet allocated, don't validate size */
2436         if (rc == 0 && !ll_i2info(dentry->d_inode)->lli_has_smd) {
2437                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2438                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2439                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2443         /* ll_glimpse_size will prefer locally cached writes if they extend
 /* (comment continuation elided) */
2447                 rc = ll_glimpse_size(inode);
/* getattr with an explicit intent: revalidate UPDATE|LOOKUP ibits, then
 * fill *stat from the (now fresh) inode fields. 32-bit API callers get
 * a FID-derived inode number instead of i_ino. */
2452 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2453                   struct lookup_intent *it, struct kstat *stat)
2455         struct inode *inode = de->d_inode;
2456         struct ll_sb_info *sbi = ll_i2sbi(inode);
2457         struct ll_inode_info *lli = ll_i2info(inode);
2460         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2461                                              MDS_INODELOCK_LOOKUP);
2462         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2467         stat->dev = inode->i_sb->s_dev;
2468         if (ll_need_32bit_api(sbi))
2469                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2471                 stat->ino = inode->i_ino;
2472         stat->mode = inode->i_mode;
2473         stat->nlink = inode->i_nlink;
2474         stat->uid = inode->i_uid;
2475         stat->gid = inode->i_gid;
2476         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2477         stat->atime = inode->i_atime;
2478         stat->mtime = inode->i_mtime;
2479         stat->ctime = inode->i_ctime;
2480 #ifdef HAVE_INODE_BLKSIZE
2481         stat->blksize = inode->i_blksize;
2483         stat->blksize = 1 << inode->i_blkbits;
2486         stat->size = i_size_read(inode);
2487         stat->blocks = inode->i_blocks;
/* VFS ->getattr: wrap ll_getattr_it() with a default IT_GETATTR intent. */
2491 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2493         struct lookup_intent it = { .it_op = IT_GETATTR };
2495         return ll_getattr_it(mnt, de, &it, stat);
/* VFS ->fiemap: bridge the kernel's fiemap_extent_info to the Lustre
 * ll_user_fiemap buffer, run ll_do_fiemap(), and copy results back.
 * Only the first extent is copied in (continuation cookie — same as
 * ll_ioctl_fiemap). */
2498 #ifdef HAVE_LINUX_FIEMAP_H
2499 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2500               __u64 start, __u64 len)
2504         struct ll_user_fiemap *fiemap;
2505         unsigned int extent_count = fieinfo->fi_extents_max;
2507         num_bytes = sizeof(*fiemap) + (extent_count *
2508                                        sizeof(struct ll_fiemap_extent));
2509         OBD_ALLOC_LARGE(fiemap, num_bytes);
2514         fiemap->fm_flags = fieinfo->fi_flags;
2515         fiemap->fm_extent_count = fieinfo->fi_extents_max;
2516         fiemap->fm_start = start;
2517         fiemap->fm_length = len;
2518         memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2519                sizeof(struct ll_fiemap_extent));
2521         rc = ll_do_fiemap(inode, fiemap, num_bytes);
2523         fieinfo->fi_flags = fiemap->fm_flags;
2524         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2525         memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2526                fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2528         OBD_FREE_LARGE(fiemap, num_bytes);
/* POSIX ACL check callback for generic_permission(): duplicate the
 * cached ACL under lli_lock and evaluate it. Compiled out without
 * CONFIG_FS_POSIX_ACL; bails in RCU walk on 4-arg kernels. */
2536 lustre_check_acl(struct inode *inode, int mask, unsigned int flags)
2538 lustre_check_acl(struct inode *inode, int mask)
2541 #ifdef CONFIG_FS_POSIX_ACL
2542         struct ll_inode_info *lli = ll_i2info(inode);
2543         struct posix_acl *acl;
2547 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
 /* Cannot block (posix_acl_dup/spin_lock) in RCU-walk mode. */
2548         if (flags & IPERM_FLAG_RCU)
2551         cfs_spin_lock(&lli->lli_lock);
2552         acl = posix_acl_dup(lli->lli_posix_acl);
2553         cfs_spin_unlock(&lli->lli_lock);
2558         rc = posix_acl_permission(inode, acl, mask);
2559         posix_acl_release(acl);
/* VFS ->permission: revalidate the root inode first (it is never
 * revalidated by lookup), delegate to the remote-permission path on
 * RMT_CLIENT mounts, otherwise run generic_permission with the Lustre
 * ACL callback. Three kernel-API signatures are supported. */
2567 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2568 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2570 # ifdef HAVE_INODE_PERMISION_2ARGS
2571 int ll_inode_permission(struct inode *inode, int mask)
2573 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2580 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2581         if (flags & IPERM_FLAG_RCU)
2585        /* as root inode are NOT getting validated in lookup operation,
2586         * need to do it before permission check. */
2588         if (inode == inode->i_sb->s_root->d_inode) {
2589                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2591                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2592                                               MDS_INODELOCK_LOOKUP);
2597         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2598                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2600         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2601                 return lustre_check_remote_perm(inode, mask);
2603         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2604         rc = ll_generic_permission(inode, mask, flags, lustre_check_acl);
/* Select the vectored-I/O file_operations member names and handlers for
 * the running kernel: readv/writev on old kernels, aio_read/aio_write
 * on newer ones. Used by the three file_operations tables below. */
2609 #ifdef HAVE_FILE_READV
2610 #define READ_METHOD readv
2611 #define READ_FUNCTION ll_file_readv
2612 #define WRITE_METHOD writev
2613 #define WRITE_FUNCTION ll_file_writev
2615 #define READ_METHOD aio_read
2616 #define READ_FUNCTION ll_file_aio_read
2617 #define WRITE_METHOD aio_write
2618 #define WRITE_FUNCTION ll_file_aio_write
/* -o localflock - only provides locally consistent flock locks */
/* Default table: no .flock/.lock members, so the kernel falls back to
 * node-local locking only. */
2622 struct file_operations ll_file_operations = {
2623         .read           = ll_file_read,
2624         .READ_METHOD    = READ_FUNCTION,
2625         .write          = ll_file_write,
2626         .WRITE_METHOD   = WRITE_FUNCTION,
2627         .unlocked_ioctl = ll_file_ioctl,
2628         .open           = ll_file_open,
2629         .release        = ll_file_release,
2630         .mmap           = ll_file_mmap,
2631         .llseek         = ll_file_seek,
2632 #ifdef HAVE_KERNEL_SENDFILE
2633         .sendfile       = ll_file_sendfile,
2635 #ifdef HAVE_KERNEL_SPLICE_READ
2636         .splice_read    = ll_file_splice_read,
/*
 * Variant of ll_file_operations used when the filesystem is mounted
 * with -o flock: identical method table plus ->flock/->lock handlers
 * that provide cluster-coherent file locking via ll_file_flock().
 */
2642 struct file_operations ll_file_operations_flock = {
2643 .read = ll_file_read,
2644 .READ_METHOD = READ_FUNCTION,
2645 .write = ll_file_write,
2646 .WRITE_METHOD = WRITE_FUNCTION,
2647 .unlocked_ioctl = ll_file_ioctl,
2648 .open = ll_file_open,
2649 .release = ll_file_release,
2650 .mmap = ll_file_mmap,
2651 .llseek = ll_file_seek,
2652 #ifdef HAVE_KERNEL_SENDFILE
2653 .sendfile = ll_file_sendfile,
2655 #ifdef HAVE_KERNEL_SPLICE_READ
2656 .splice_read = ll_file_splice_read,
/* BSD flock() and POSIX fcntl() locks both routed through Lustre DLM */
2660 .flock = ll_file_flock,
2661 .lock = ll_file_flock
2664 /* These are for -o noflock - to return ENOSYS on flock calls */
2665 struct file_operations ll_file_operations_noflock = {
2666 .read = ll_file_read,
2667 .READ_METHOD = READ_FUNCTION,
2668 .write = ll_file_write,
2669 .WRITE_METHOD = WRITE_FUNCTION,
2670 .unlocked_ioctl = ll_file_ioctl,
2671 .open = ll_file_open,
2672 .release = ll_file_release,
2673 .mmap = ll_file_mmap,
2674 .llseek = ll_file_seek,
2675 #ifdef HAVE_KERNEL_SENDFILE
2676 .sendfile = ll_file_sendfile,
2678 #ifdef HAVE_KERNEL_SPLICE_READ
2679 .splice_read = ll_file_splice_read,
/* ll_file_noflock rejects all locking requests (per the comment above,
 * with ENOSYS) */
2683 .flock = ll_file_noflock,
2684 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files: attribute get/set,
 * permission checking, the xattr family, and (where the kernel has
 * fiemap support) extent mapping.
 */
2687 struct inode_operations ll_file_inode_operations = {
2688 .setattr = ll_setattr,
2689 .truncate = ll_truncate,
2690 .getattr = ll_getattr,
2691 .permission = ll_inode_permission,
2692 .setxattr = ll_setxattr,
2693 .getxattr = ll_getxattr,
2694 .listxattr = ll_listxattr,
2695 .removexattr = ll_removexattr,
2696 #ifdef HAVE_LINUX_FIEMAP_H
2697 .fiemap = ll_fiemap,
2701 /* dynamic ioctl number support routines */
/*
 * Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a reader/writer semaphore.
 */
2702 static struct llioc_ctl_data {
2703 cfs_rw_semaphore_t ioc_sem;
2704 cfs_list_t ioc_head;
2706 __RWSEM_INITIALIZER(llioc.ioc_sem),
2707 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/*
 * One registered handler: the callback, the ioctl numbers it serves
 * (iocd_cmd[] is a trailing variable-length array of iocd_count
 * entries), and iocd_size = total allocation size for freeing.
 * NOTE(review): the struct's opening declaration line appears elided
 * in this chunk.
 */
2712 cfs_list_t iocd_list;
2713 unsigned int iocd_size;
2714 llioc_callback_t iocd_cb;
2715 unsigned int iocd_count;
2716 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler.
 *
 * \param cb    callback invoked by ll_iocontrol_call() for matching cmds
 * \param count number of entries in \a cmd (0 <= count <= LLIOC_MAX_CMD)
 * \param cmd   array of ioctl numbers the callback serves
 *
 * Returns an opaque cookie (the llioc_data pointer) to pass to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (the early-return lines appear elided in this chunk).
 */
2719 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2722 struct llioc_data *in_data = NULL;
2725 if (cb == NULL || cmd == NULL ||
2726 count > LLIOC_MAX_CMD || count < 0)
/* allocation covers the struct plus the trailing iocd_cmd[count] array */
2729 size = sizeof(*in_data) + count * sizeof(unsigned int);
2730 OBD_ALLOC(in_data, size);
2731 if (in_data == NULL)
/* NOTE(review): OBD_ALLOC normally zero-fills, so this memset looks
 * redundant — confirm before removing */
2734 memset(in_data, 0, sizeof(*in_data));
2735 in_data->iocd_size = size;
2736 in_data->iocd_cb = cb;
2737 in_data->iocd_count = count;
2738 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the entry under the write lock */
2740 cfs_down_write(&llioc.ioc_sem);
2741 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2742 cfs_up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  Walks the registry under the write lock,
 * removes and frees the matching entry; warns if \a magic is unknown.
 * NOTE(review): the "tmp == magic" comparison and early return inside
 * the loop appear elided in this chunk.
 */
2747 void ll_iocontrol_unregister(void *magic)
2749 struct llioc_data *tmp;
2754 cfs_down_write(&llioc.ioc_sem);
2755 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the allocation size before the entry is freed */
2757 unsigned int size = tmp->iocd_size;
2759 cfs_list_del(&tmp->iocd_list);
2760 cfs_up_write(&llioc.ioc_sem);
2762 OBD_FREE(tmp, size);
2766 cfs_up_write(&llioc.ioc_sem);
2768 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2771 EXPORT_SYMBOL(ll_iocontrol_register);
2772 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an ioctl to the registered dynamic handlers.
 *
 * Scans every registered llioc_data under the read lock; when \a cmd
 * matches one of an entry's iocd_cmd[] numbers its callback is invoked.
 * A callback returning LLIOC_STOP ends the scan and its *rcp result is
 * reported to the caller; otherwise iteration continues (LLIOC_CONT).
 */
2774 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2775 unsigned int cmd, unsigned long arg, int *rcp)
2777 enum llioc_iter ret = LLIOC_CONT;
2778 struct llioc_data *data;
2779 int rc = -EINVAL, i;
2781 cfs_down_read(&llioc.ioc_sem);
2782 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2783 for (i = 0; i < data->iocd_count; i++) {
/* skip commands this handler does not serve */
2784 if (cmd != data->iocd_cmd[i])
2787 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
/* LLIOC_STOP means the command was consumed; stop scanning */
2791 if (ret == LLIOC_STOP)
2794 cfs_up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration \a conf into the inode's cl_object via
 * cl_conf_set(), using a nested client environment.  Returns the
 * cl_conf_set() result, or PTR_ERR of the environment on failure
 * (the no-op path when lli_clob == NULL has its return line elided
 * in this chunk).
 */
2801 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2803 struct ll_inode_info *lli = ll_i2info(inode);
2804 struct cl_env_nest nest;
/* nothing to configure if the inode has no cl_object yet */
2809 if (lli->lli_clob == NULL)
2812 env = cl_env_nested_get(&nest);
2814 RETURN(PTR_ERR(env));
2816 result = cl_conf_set(env, lli->lli_clob, conf);
2817 cl_env_nested_put(&nest, env);
2822 * This function checks if there exists a LAYOUT lock on the client side,
2823 * or enqueues it if it doesn't have one in cache.
2825 * This function will not hold layout lock so it may be revoked any time after
2826 this function returns. Any operation that depends on the layout should be redone
2829 * This function should be called before lov_io_init() to get an uptodate
2830 * layout version, the caller should save the version number and after IO
2831 * is finished, this function should be called again to verify that layout
2832 * is not changed during IO time.
2834 int ll_layout_refresh(struct inode *inode, __u32 *gen)
2836 struct ll_inode_info *lli = ll_i2info(inode);
2837 struct ll_sb_info *sbi = ll_i2sbi(inode);
2838 struct md_op_data *op_data = NULL;
2839 struct ptlrpc_request *req = NULL;
2840 struct lookup_intent it = { .it_op = IT_LAYOUT };
2841 struct lustre_handle lockh;
2843 struct cl_object_conf conf = { .coc_inode = inode,
2844 .coc_validate_only = true };
2849 if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_LAYOUT_LOCK))
2853 LASSERT(fid_is_sane(ll_inode2fid(inode)));
2854 LASSERT(S_ISREG(inode->i_mode));
2856 /* mostly layout lock is caching on the local side, so try to match
2857 * it before grabbing layout lock mutex. */
2858 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh);
2859 if (mode != 0) { /* hit cached lock */
2860 struct lov_stripe_md *lsm;
2862 lsm = ccc_inode_lsm_get(inode);
2864 *gen = lsm->lsm_layout_gen;
2865 ccc_inode_lsm_put(inode, lsm);
2866 ldlm_lock_decref(&lockh, mode);
2871 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
2872 0, 0, LUSTRE_OPC_ANY, NULL);
2873 if (IS_ERR(op_data))
2874 RETURN(PTR_ERR(op_data));
2876 /* take layout lock mutex to enqueue layout lock exclusively. */
2877 cfs_mutex_lock(&lli->lli_layout_mutex);
2879 /* make sure the old conf goes away */
2880 ll_layout_conf(inode, &conf);
2882 /* enqueue layout lock */
2883 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0,
2884 &req, ll_md_blocking_ast, 0);
2886 /* we get a new lock, so update the lock data */
2887 lockh.cookie = it.d.lustre.it_lock_handle;
2888 md_set_lock_data(sbi->ll_md_exp, &lockh.cookie, inode, NULL);
2890 /* req == NULL is when lock was found in client cache, without
2891 * any request to server (but lsm can be canceled just after a
2894 struct ldlm_lock *lock = ldlm_handle2lock(&lockh);
2895 struct lustre_md md = { NULL };
2899 /* for IT_LAYOUT lock, lmm is returned in lock's lvb
2900 * data via completion callback */
2901 LASSERT(lock != NULL);
2902 lmm = lock->l_lvb_data;
2903 lmmsize = lock->l_lvb_len;
2905 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
2909 *gen = md.lsm->lsm_layout_gen;
2911 memset(&conf, 0, sizeof conf);
2912 conf.coc_inode = inode;
2913 conf.u.coc_md = &md;
2914 ll_layout_conf(inode, &conf);
2916 lli->lli_has_smd = md.lsm != NULL;
2919 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
2921 LDLM_LOCK_PUT(lock);
2922 ptlrpc_req_finished(req);
2923 } else { /* hit caching lock */
2924 struct lov_stripe_md *lsm;
2926 lsm = ccc_inode_lsm_get(inode);
2928 *gen = lsm->lsm_layout_gen;
2929 ccc_inode_lsm_put(inode, lsm);
2931 ll_intent_drop_lock(&it);
2933 cfs_mutex_unlock(&lli->lli_layout_mutex);
2934 ll_finish_md_op_data(op_data);