1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
32 * Copyright (c) 2011, 2012, Whamcloud, Inc.
35 * This file is part of Lustre, http://www.lustre.org/
36 * Lustre is a trademark of Sun Microsystems, Inc.
40 * Author: Peter Braam <braam@clusterfs.com>
41 * Author: Phil Schwan <phil@clusterfs.com>
42 * Author: Andreas Dilger <adilger@clusterfs.com>
45 #define DEBUG_SUBSYSTEM S_LLITE
46 #include <lustre_dlm.h>
47 #include <lustre_lite.h>
48 #include <linux/pagemap.h>
49 #include <linux/file.h>
50 #include "llite_internal.h"
51 #include <lustre/ll_fiemap.h>
53 #include "cl_object.h"
/* Allocate per-open file-descriptor state from the ll_file_data slab
 * cache with IO-safe allocation flags.
 * NOTE(review): excerpt is truncated here; presumably returns @fd. */
55 struct ll_file_data *ll_file_data_get(void)
57 struct ll_file_data *fd;
59 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
/* Release an ll_file_data object back to its slab cache. */
63 static void ll_file_data_put(struct ll_file_data *fd)
66 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (mode, times, size, blocks, flags,
 * IO epoch) plus the open handle @fh and an MDS capability into @op_data,
 * for sending to the MDS. */
69 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
70 struct lustre_handle *fh)
72 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
73 op_data->op_attr.ia_mode = inode->i_mode;
74 op_data->op_attr.ia_atime = inode->i_atime;
75 op_data->op_attr.ia_mtime = inode->i_mtime;
76 op_data->op_attr.ia_ctime = inode->i_ctime;
77 op_data->op_attr.ia_size = i_size_read(inode);
78 op_data->op_attr_blocks = inode->i_blocks;
/* Inode flags are translated to on-wire (ext) flag representation. */
79 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
80 ll_inode_to_ext_flags(inode->i_flags);
81 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
83 op_data->op_handle = *fh;
/* NOTE(review): ll_mdscapa_get() presumably takes a capability ref;
 * release responsibility not visible from this excerpt — confirm. */
84 op_data->op_capa1 = ll_mdscapa_get(inode);
/*
88 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc. Skips size/blocks for non-SOM mounts or non-regular
 * files; only a writing handle can carry size authority.
 */
91 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
92 struct obd_client_handle *och)
96 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
97 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Size/blocks are only meaningful on close of a write handle. */
99 if (!(och->och_flags & FMODE_WRITE))
102 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
103 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
105 ll_ioepoch_close(inode, op_data, &och, 0);
108 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
109 ll_prep_md_op_data(op_data, inode, NULL, NULL,
110 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send a CLOSE rpc to the MDS for open handle @och and tear down the
 * local open-replay state. If the MDS requests a Size-on-MDS update,
 * gather attributes from the OSTs and push them back via setattr.
 * Errors are logged but cleanup proceeds regardless. */
114 static int ll_close_inode_openhandle(struct obd_export *md_exp,
116 struct obd_client_handle *och)
118 struct obd_export *exp = ll_i2mdexp(inode);
119 struct md_op_data *op_data;
120 struct ptlrpc_request *req = NULL;
121 struct obd_device *obd = class_exp2obd(exp);
128 * XXX: in case of LMV, is this correct to access
131 CERROR("Invalid MDC connection handle "LPX64"\n",
132 ll_i2mdexp(inode)->exp_handle.h_cookie);
136 OBD_ALLOC_PTR(op_data);
138 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
140 ll_prepare_close(inode, op_data, och);
/* Remember whether this close ends the IO epoch before md_close(). */
141 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142 rc = md_close(md_exp, op_data, och->och_mod, &req);
144 /* This close must have the epoch closed. */
145 LASSERT(epoch_close);
146 /* MDS has instructed us to obtain Size-on-MDS attribute from
147 * OSTs and send setattr to back to MDS. */
148 rc = ll_som_update(inode, op_data);
150 CERROR("inode %lu mdc Size-on-MDS update failed: "
151 "rc = %d\n", inode->i_ino, rc);
155 CERROR("inode %lu mdc close failed: rc = %d\n",
158 ll_finish_md_op_data(op_data);
/* Destroy orphan OST objects named in the close reply, if any. */
161 rc = ll_objects_destroy(req, inode);
163 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM write handle with an epoch still open: defer DONE_WRITING. */
170 if (exp_connect_som(exp) && !epoch_close &&
171 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
174 md_clear_open_replay_data(md_exp, och);
175 /* Free @och if it is not waiting for DONE_WRITING. */
176 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
179 if (req) /* This is close request */
180 ptlrpc_req_finished(req);
/* Close the MDS open handle of the kind selected by @flags
 * (write/exec/read) if this caller holds the last reference to it.
 * If other users remain, the handle is kept for reuse. */
184 int ll_md_real_close(struct inode *inode, int flags)
186 struct ll_inode_info *lli = ll_i2info(inode);
187 struct obd_client_handle **och_p;
188 struct obd_client_handle *och;
/* Pick the open-handle slot and use count matching the open mode. */
193 if (flags & FMODE_WRITE) {
194 och_p = &lli->lli_mds_write_och;
195 och_usecount = &lli->lli_open_fd_write_count;
196 } else if (flags & FMODE_EXEC) {
197 och_p = &lli->lli_mds_exec_och;
198 och_usecount = &lli->lli_open_fd_exec_count;
200 LASSERT(flags & FMODE_READ);
201 och_p = &lli->lli_mds_read_och;
202 och_usecount = &lli->lli_open_fd_read_count;
205 cfs_mutex_lock(&lli->lli_och_mutex);
206 if (*och_usecount) { /* There are still users of this handle, so
208 cfs_mutex_unlock(&lli->lli_och_mutex);
213 cfs_mutex_unlock(&lli->lli_och_mutex);
215 if (och) { /* There might be a race and somebody have freed this och
217 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: drop a group lock if held, decrement the
 * per-mode open count, and skip the MDS CLOSE rpc entirely when a
 * matching OPEN dlm lock is still cached locally (the lock keeps the
 * handle alive); otherwise fall through to ll_md_real_close(). */
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
227 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228 struct ll_inode_info *lli = ll_i2info(inode);
232 /* clear group lock, if present */
233 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
236 /* Let's see if we have good enough OPEN lock on the file and if
237 we can skip talking to MDS */
238 if (file->f_dentry->d_inode) { /* Can this ever be false? */
240 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241 struct lustre_handle lockh;
242 struct inode *inode = file->f_dentry->d_inode;
243 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
245 cfs_mutex_lock(&lli->lli_och_mutex);
/* Drop this fd's contribution to the matching open-mode count. */
246 if (fd->fd_omode & FMODE_WRITE) {
248 LASSERT(lli->lli_open_fd_write_count);
249 lli->lli_open_fd_write_count--;
250 } else if (fd->fd_omode & FMODE_EXEC) {
252 LASSERT(lli->lli_open_fd_exec_count);
253 lli->lli_open_fd_exec_count--;
256 LASSERT(lli->lli_open_fd_read_count);
257 lli->lli_open_fd_read_count--;
259 cfs_mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock: must really close the MDS handle. */
261 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262 LDLM_IBITS, &policy, lockmode,
264 rc = ll_md_real_close(file->f_dentry->d_inode,
268 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269 file, file->f_dentry, file->f_dentry->d_name.name);
272 LUSTRE_FPRIVATE(file) = NULL;
273 ll_file_data_put(fd);
274 ll_capa_close(inode);
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
/* VFS ->release() handler. */
281 /* While this returns an error code, fput() the caller does not, so we need
282 * to make every effort to clean up all of our state here. Also, applications
283 * rarely check close errors and even if an error is returned they will not
284 * re-try the close call.
 */
286 int ll_file_release(struct inode *inode, struct file *file)
288 struct ll_file_data *fd;
289 struct ll_sb_info *sbi = ll_i2sbi(inode);
290 struct ll_inode_info *lli = ll_i2info(inode);
291 struct lov_stripe_md *lsm = lli->lli_smd;
295 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296 inode->i_generation, inode);
298 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping only applies to the fs root. */
299 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300 inode == inode->i_sb->s_root->d_inode) {
301 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
304 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305 fd->fd_flags &= ~LL_FILE_RMTACL;
306 rct_del(&sbi->ll_rct, cfs_curproc_pid());
307 et_search_free(&sbi->ll_et, cfs_curproc_pid());
312 if (inode->i_sb->s_root != file->f_dentry)
313 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314 fd = LUSTRE_FPRIVATE(file);
317 /* The last ref on @file, maybe not the the owner pid of statahead.
318 * Different processes can open the same dir, "ll_opendir_key" means:
319 * it is me that should stop the statahead thread. */
320 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
321 lli->lli_opendir_pid != 0)
322 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root never has an MDS open handle; just drop local state. */
324 if (inode->i_sb->s_root == file->f_dentry) {
325 LUSTRE_FPRIVATE(file) = NULL;
326 ll_file_data_put(fd);
/* Surface any deferred async write error on this close. */
330 if (!S_ISDIR(inode->i_mode)) {
332 lov_test_and_clear_async_rc(lsm);
333 lli->lli_async_rc = 0;
336 rc = ll_md_close(sbi->ll_md_exp, inode, file);
338 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
339 libcfs_debug_dumplog();
/* Perform an intent-based OPEN against the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) for setstripe. On success the
 * inode is refreshed from the reply and lock data recorded; the intent
 * keeps a reference on the reply for the caller. */
344 static int ll_intent_file_open(struct file *file, void *lmm,
345 int lmmsize, struct lookup_intent *itp)
347 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
348 struct dentry *parent = file->f_dentry->d_parent;
349 const char *name = file->f_dentry->d_name.name;
350 const int len = file->f_dentry->d_name.len;
351 struct md_op_data *op_data;
352 struct ptlrpc_request *req;
353 __u32 opc = LUSTRE_OPC_ANY;
360 /* Usually we come here only for NFSD, and we want open lock.
361 But we can also get here with pre 2.6.15 patchless kernels, and in
362 that case that lock is also ok */
363 /* We can also get here if there was cached open handle in revalidate_it
364 * but it disappeared while we were getting from there to ll_file_open.
365 * But this means this file was closed and immediatelly opened which
366 * makes a good candidate for using OPEN lock */
367 /* If lmmsize & lmm are not 0, we are just setting stripe info
368 * parameters. No need for the open lock */
369 if (lmm == NULL && lmmsize == 0) {
370 itp->it_flags |= MDS_OPEN_LOCK;
371 if (itp->it_flags & FMODE_WRITE)
372 opc = LUSTRE_OPC_CREATE;
375 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
376 file->f_dentry->d_inode, name, len,
379 RETURN(PTR_ERR(op_data));
381 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
382 0 /*unused */, &req, ll_md_blocking_ast, 0);
383 ll_finish_md_op_data(op_data);
385 /* reason for keep own exit path - don`t flood log
386 * with messages with -ESTALE errors.
 */
/* Server did open but reported an open error: drop the handle. */
388 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
389 it_open_error(DISP_OPEN_OPEN, itp))
391 ll_release_openhandle(file->f_dentry, itp);
395 if (it_disposition(itp, DISP_LOOKUP_NEG))
396 GOTO(out, rc = -ENOENT);
398 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
399 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
400 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Refresh the inode from the reply and attach lock data. */
404 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
405 if (!rc && itp->d.lustre.it_lock_mode)
406 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
/* Error path: release the reply ref and any lock on the intent. */
410 ptlrpc_req_finished(itp->d.lustre.it_data);
411 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
412 ll_intent_drop_lock(itp);
/*
418 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
419 * not believe attributes if a few ioepoch holders exist. Attributes for
420 * previous ioepoch if new one is opened are also skipped by MDS.
 */
422 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
424 if (ioepoch && lli->lli_ioepoch != ioepoch) {
425 lli->lli_ioepoch = ioepoch;
426 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
427 ioepoch, PFID(&lli->lli_fid));
/* Populate an obd_client_handle from the OPEN reply attached to
 * intent @it (file handle, fid, open flags, IO epoch) and register it
 * for open replay. Returns the md_set_open_replay_data() result. */
431 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
432 struct lookup_intent *it, struct obd_client_handle *och)
434 struct ptlrpc_request *req = it->d.lustre.it_data;
435 struct mdt_body *body;
439 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
440 LASSERT(body != NULL); /* reply already checked out */
442 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
443 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
444 och->och_fid = lli->lli_fid;
445 och->och_flags = it->it_flags;
446 ll_ioepoch_open(lli, body->ioepoch);
448 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-local part of an open: fill @och from the intent
 * reply (when provided), then attach @fd to the struct file, init
 * readahead state and record the open mode. */
451 int ll_local_open(struct file *file, struct lookup_intent *it,
452 struct ll_file_data *fd, struct obd_client_handle *och)
454 struct inode *inode = file->f_dentry->d_inode;
455 struct ll_inode_info *lli = ll_i2info(inode);
/* fd must not already be installed on this file. */
458 LASSERT(!LUSTRE_FPRIVATE(file));
463 struct ptlrpc_request *req = it->d.lustre.it_data;
464 struct mdt_body *body;
467 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
471 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
472 if ((it->it_flags & FMODE_WRITE) &&
473 (body->valid & OBD_MD_FLSIZE))
474 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
475 lli->lli_ioepoch, PFID(&lli->lli_fid));
478 LUSTRE_FPRIVATE(file) = fd;
479 ll_readahead_init(inode, &fd->fd_ras);
480 fd->fd_omode = it->it_flags;
/* VFS ->open() handler. */
484 /* Open a file, and (for the very first open) create objects on the OSTs at
485 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
486 * creation or open until ll_lov_setstripe() ioctl is called.
 *
488 * If we already have the stripe MD locally then we don't request it in
489 * md_open(), by passing a lmm_size = 0.
 *
491 * It is up to the application to ensure no other processes open this file
492 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
493 * used. We might be able to avoid races of that sort by getting lli_open_sem
494 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
495 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
497 int ll_file_open(struct inode *inode, struct file *file)
499 struct ll_inode_info *lli = ll_i2info(inode);
500 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
501 .it_flags = file->f_flags };
502 struct lov_stripe_md *lsm;
503 struct obd_client_handle **och_p = NULL;
504 __u64 *och_usecount = NULL;
505 struct ll_file_data *fd;
506 int rc = 0, opendir_set = 0;
509 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
510 inode->i_generation, inode, file->f_flags);
/* An intent stashed by lookup/revalidate may ride in private_data. */
512 it = file->private_data; /* XXX: compat macro */
513 file->private_data = NULL; /* prevent ll_local_open assertion */
515 fd = ll_file_data_get();
517 GOTO(out_och_free, rc = -ENOMEM);
/* First opener of a directory becomes the statahead owner. */
520 if (S_ISDIR(inode->i_mode)) {
521 cfs_spin_lock(&lli->lli_sa_lock);
522 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
523 lli->lli_opendir_pid == 0) {
524 lli->lli_opendir_key = fd;
525 lli->lli_opendir_pid = cfs_curproc_pid();
528 cfs_spin_unlock(&lli->lli_sa_lock);
/* The fs root needs no MDS open handle. */
531 if (inode->i_sb->s_root == file->f_dentry) {
532 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent: build our own OPEN intent from f_flags. */
536 if (!it || !it->d.lustre.it_disposition) {
537 /* Convert f_flags into access mode. We cannot use file->f_mode,
538 * because everything but O_ACCMODE mask was stripped from
 * it. */
540 if ((oit.it_flags + 1) & O_ACCMODE)
542 if (file->f_flags & O_TRUNC)
543 oit.it_flags |= FMODE_WRITE;
545 /* kernel only call f_op->open in dentry_open. filp_open calls
546 * dentry_open after call to open_namei that checks permissions.
547 * Only nfsd_open call dentry_open directly without checking
548 * permissions and because of that this code below is safe. */
549 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
550 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
552 /* We do not want O_EXCL here, presumably we opened the file
553 * already? XXX - NFS implications? */
554 oit.it_flags &= ~O_EXCL;
556 /* bug20584, if "it_flags" contains O_CREAT, the file will be
557 * created if necessary, then "IT_CREAT" should be set to keep
558 * consistent with it */
559 if (oit.it_flags & O_CREAT)
560 oit.it_op |= IT_CREAT;
566 /* Let's see if we have file open on MDS already. */
567 if (it->it_flags & FMODE_WRITE) {
568 och_p = &lli->lli_mds_write_och;
569 och_usecount = &lli->lli_open_fd_write_count;
570 } else if (it->it_flags & FMODE_EXEC) {
571 och_p = &lli->lli_mds_exec_och;
572 och_usecount = &lli->lli_open_fd_exec_count;
574 och_p = &lli->lli_mds_read_och;
575 och_usecount = &lli->lli_open_fd_read_count;
578 cfs_mutex_lock(&lli->lli_och_mutex);
579 if (*och_p) { /* Open handle is present */
580 if (it_disposition(it, DISP_OPEN_OPEN)) {
581 /* Well, there's extra open request that we do not need,
582 let's close it somehow. This will decref request. */
583 rc = it_open_error(DISP_OPEN_OPEN, it);
585 cfs_mutex_unlock(&lli->lli_och_mutex);
586 GOTO(out_openerr, rc);
589 ll_release_openhandle(file->f_dentry, it);
590 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the existing handle; no och needed for this fd. */
595 rc = ll_local_open(file, it, fd, NULL);
598 cfs_mutex_unlock(&lli->lli_och_mutex);
599 GOTO(out_openerr, rc);
602 LASSERT(*och_usecount == 0);
603 if (!it->d.lustre.it_disposition) {
604 /* We cannot just request lock handle now, new ELC code
605 means that one of other OPEN locks for this file
606 could be cancelled, and since blocking ast handler
607 would attempt to grab och_mutex as well, that would
608 result in a deadlock */
609 cfs_mutex_unlock(&lli->lli_och_mutex);
610 it->it_create_mode |= M_CHECK_STALE;
611 rc = ll_intent_file_open(file, NULL, 0, it);
612 it->it_create_mode &= ~M_CHECK_STALE;
614 GOTO(out_openerr, rc);
618 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
620 GOTO(out_och_free, rc = -ENOMEM);
624 /* md_intent_lock() didn't get a request ref if there was an
625 * open error, so don't do cleanup on the request here
 */
627 /* XXX (green): Should not we bail out on any error here, not
628 * just open error? */
629 rc = it_open_error(DISP_OPEN_OPEN, it);
631 GOTO(out_och_free, rc);
633 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
635 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
636 rc = ll_local_open(file, it, fd, *och_p);
638 GOTO(out_och_free, rc);
640 cfs_mutex_unlock(&lli->lli_och_mutex);
643 /* Must do this outside lli_och_mutex lock to prevent deadlock where
644 different kind of OPEN lock for this same inode gets cancelled
645 by ldlm_cancel_lru */
646 if (!S_ISREG(inode->i_mode))
647 GOTO(out_och_free, rc);
/* Delayed object creation: leave striping to the setstripe ioctl. */
653 if (file->f_flags & O_LOV_DELAY_CREATE ||
654 !(file->f_mode & FMODE_WRITE)) {
655 CDEBUG(D_INODE, "object creation was delayed\n");
656 GOTO(out_och_free, rc);
659 file->f_flags &= ~O_LOV_DELAY_CREATE;
660 GOTO(out_och_free, rc);
/* Common exit: drop the intent's reply reference if we own one. */
663 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
664 ptlrpc_req_finished(it->d.lustre.it_data);
665 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
669 if (och_p && *och_p) {
670 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
671 *och_p = NULL; /* OBD_FREE writes some magic there */
674 cfs_mutex_unlock(&lli->lli_och_mutex);
677 if (opendir_set != 0)
678 ll_stop_statahead(inode, lli->lli_opendir_key);
680 ll_file_data_put(fd);
686 /* Fills the obdo with the attributes for the lsm */
/* Issues an async getattr to the OSTs via a ptlrpc set and waits for
 * completion. @sync non-zero requests the server-side (SRVLOCK) variant.
 * On success oa->o_valid is masked down to the fields OSTs own. */
687 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
688 struct obd_capa *capa, struct obdo *obdo,
689 __u64 ioepoch, int sync)
691 struct ptlrpc_request_set *set;
692 struct obd_info oinfo = { { { 0 } } };
697 LASSERT(lsm != NULL);
701 oinfo.oi_oa->o_id = lsm->lsm_object_id;
702 oinfo.oi_oa->o_seq = lsm->lsm_object_seq;
703 oinfo.oi_oa->o_mode = S_IFREG;
704 oinfo.oi_oa->o_ioepoch = ioepoch;
705 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
706 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
707 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
708 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
709 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
710 OBD_MD_FLDATAVERSION;
711 oinfo.oi_capa = capa;
/* Sync mode: take the glimpse under a server-side lock. */
713 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
714 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
717 set = ptlrpc_prep_set();
719 CERROR("can't allocate ptlrpc set\n");
722 rc = obd_getattr_async(exp, &oinfo, set);
724 rc = ptlrpc_set_wait(set);
725 ptlrpc_set_destroy(set);
/* Keep only OST-authoritative attributes in the valid mask. */
728 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
729 OBD_MD_FLATIME | OBD_MD_FLMTIME |
730 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
731 OBD_MD_FLDATAVERSION);
/*
736 * Performs the getattr on the inode and updates its fields.
737 * If @sync != 0, perform the getattr under the server-side lock.
 */
739 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
740 __u64 ioepoch, int sync)
742 struct ll_inode_info *lli = ll_i2info(inode);
743 struct obd_capa *capa = ll_mdscapa_get(inode);
747 rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode),
748 capa, obdo, ioepoch, sync);
/* Fold the OST-reported attributes back into the VFS inode. */
751 obdo_refresh_inode(inode, obdo, obdo->o_valid);
753 "objid "LPX64" size %llu, blocks %llu, blksize %lu\n",
754 lli->lli_smd->lsm_object_id, i_size_read(inode),
755 (unsigned long long)inode->i_blocks,
756 (unsigned long)ll_inode_blksize(inode));
/* Merge the MDS-cached lock-value-block timestamps with OST state and
 * write the resulting size/blocks/times into the inode, all under the
 * inode size lock. */
761 int ll_merge_lvb(struct inode *inode)
763 struct ll_inode_info *lli = ll_i2info(inode);
764 struct ll_sb_info *sbi = ll_i2sbi(inode);
770 ll_inode_size_lock(inode, 1);
771 inode_init_lvb(inode, &lvb);
773 /* merge timestamps the most resently obtained from mds with
774 timestamps obtained from osts */
775 lvb.lvb_atime = lli->lli_lvb.lvb_atime;
776 lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
777 lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
778 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
779 cl_isize_write_nolock(inode, lvb.lvb_size);
781 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
782 PFID(&lli->lli_fid), lvb.lvb_size);
783 inode->i_blocks = lvb.lvb_blocks;
785 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
786 LTIME_S(inode->i_atime) = lvb.lvb_atime;
787 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
788 ll_inode_size_unlock(inode, 1);
/* Glimpse helper for ioctls: fetch current size/blocks/times for @lsm
 * from the OSTs and copy them into the caller-supplied stat structure. */
793 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
796 struct obdo obdo = { 0 };
799 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
801 st->st_size = obdo.o_size;
802 st->st_blocks = obdo.o_blocks;
803 st->st_mtime = obdo.o_mtime;
804 st->st_atime = obdo.o_atime;
805 st->st_ctime = obdo.o_ctime;
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0)
 * on @file: set nonblock/append flags, the target cl_object, and the
 * lock requirement (never for nolock files, mandatory for O_APPEND). */
810 void ll_io_init(struct cl_io *io, const struct file *file, int write)
812 struct inode *inode = file->f_dentry->d_inode;
814 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
816 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
817 io->ci_obj = ll_i2info(inode)->lli_clob;
818 io->ci_lockreq = CILR_MAYBE;
819 if (ll_file_nolock(file)) {
820 io->ci_lockreq = CILR_NEVER;
821 io->ci_no_srvlock = 1;
822 } else if (file->f_flags & O_APPEND) {
823 io->ci_lockreq = CILR_MANDATORY;
/* Common engine for all read/write entry points (readv/aio/sendfile/
 * splice). Sets up a cl_io from @args, takes lli_write_mutex for
 * non-grouplock writes and lli_trunc_sem for reads, runs the cl_io
 * loop, then updates the file position and per-sb I/O statistics. */
827 static ssize_t ll_file_io_generic(const struct lu_env *env,
828 struct vvp_io_args *args, struct file *file,
829 enum cl_io_type iot, loff_t *ppos, size_t count)
831 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
836 io = ccc_env_thread_io(env);
837 ll_io_init(io, file, iot == CIT_WRITE);
839 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
840 struct vvp_io *vio = vvp_env_io(env);
841 struct ccc_io *cio = ccc_env_io(env);
842 int write_mutex_locked = 0;
844 cio->cui_fd = LUSTRE_FPRIVATE(file);
845 vio->cui_io_subtype = args->via_io_subtype;
/* Per-subtype setup: copy iov/actor/pipe info into the io env. */
847 switch (vio->cui_io_subtype) {
849 cio->cui_iov = args->u.normal.via_iov;
850 cio->cui_nrsegs = args->u.normal.via_nrsegs;
851 cio->cui_tot_nrsegs = cio->cui_nrsegs;
852 #ifndef HAVE_FILE_WRITEV
853 cio->cui_iocb = args->u.normal.via_iocb;
 #endif
/* Serialize writes against each other unless a group lock is held. */
855 if ((iot == CIT_WRITE) &&
856 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
857 if (cfs_mutex_lock_interruptible(&lli->
859 GOTO(out, result = -ERESTARTSYS);
860 write_mutex_locked = 1;
861 } else if (iot == CIT_READ) {
862 cfs_down_read(&lli->lli_trunc_sem);
866 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
867 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
870 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
871 vio->u.splice.cui_flags = args->u.splice.via_flags;
874 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
877 result = cl_io_loop(env, io);
878 if (write_mutex_locked)
879 cfs_mutex_unlock(&lli->lli_write_mutex);
880 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
881 cfs_up_read(&lli->lli_trunc_sem);
883 /* cl_io_rw_init() handled IO */
884 result = io->ci_result;
/* Advance file position by however much the io actually moved. */
887 if (io->ci_nob > 0) {
889 *ppos = io->u.ci_wr.wr.crw_pos;
895 if (iot == CIT_READ) {
897 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
898 LPROC_LL_READ_BYTES, result);
899 } else if (iot == CIT_WRITE) {
901 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
902 LPROC_LL_WRITE_BYTES, result);
903 lli->lli_write_rc = 0;
/* Remember a write failure so it can be reported at close time. */
905 lli->lli_write_rc = result;
/*
 * Validate an iovec array and compute the total byte count.
914 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
916 static int ll_file_get_iov_count(const struct iovec *iov,
917 unsigned long *nr_segs, size_t *count)
922 for (seg = 0; seg < *nr_segs; seg++) {
923 const struct iovec *iv = &iov[seg];
926 * If any segment has a negative length, or the cumulative
927 * length ever wraps negative then return -EINVAL.
 */
930 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
932 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
/* Inaccessible segment: truncate the count at this point. */
937 cnt -= iv->iov_len; /* This segment is no good */
944 #ifdef HAVE_FILE_READV
/* ->readv() entry point (pre-AIO kernels): validate the iovec, then
 * run a CIT_READ through ll_file_io_generic(). */
945 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
946 unsigned long nr_segs, loff_t *ppos)
949 struct vvp_io_args *args;
955 result = ll_file_get_iov_count(iov, &nr_segs, &count);
959 env = cl_env_get(&refcheck);
961 RETURN(PTR_ERR(env));
963 args = vvp_env_args(env, IO_NORMAL);
964 args->u.normal.via_iov = (struct iovec *)iov;
965 args->u.normal.via_nrsegs = nr_segs;
967 result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
968 cl_env_put(env, &refcheck);
/* ->read() entry point (readv-based variant): wrap the user buffer in
 * a single-segment iovec and delegate to ll_file_readv(). */
972 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
976 struct iovec *local_iov;
981 env = cl_env_get(&refcheck);
983 RETURN(PTR_ERR(env));
985 local_iov = &vvp_env_info(env)->vti_local_iov;
986 local_iov->iov_base = (void __user *)buf;
987 local_iov->iov_len = count;
988 result = ll_file_readv(file, local_iov, 1, ppos);
989 cl_env_put(env, &refcheck);
/* ->aio_read() entry point: validate the iovec, record the kiocb in
 * the io args and run a CIT_READ at iocb->ki_pos. */
994 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
995 unsigned long nr_segs, loff_t pos)
998 struct vvp_io_args *args;
1004 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1008 env = cl_env_get(&refcheck);
1010 RETURN(PTR_ERR(env));
1012 args = vvp_env_args(env, IO_NORMAL);
1013 args->u.normal.via_iov = (struct iovec *)iov;
1014 args->u.normal.via_nrsegs = nr_segs;
1015 args->u.normal.via_iocb = iocb;
1017 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1018 &iocb->ki_pos, count);
1019 cl_env_put(env, &refcheck);
/* ->read() entry point (AIO-based variant): build a synchronous kiocb
 * plus single-segment iovec and delegate to ll_file_aio_read(). */
1023 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1027 struct iovec *local_iov;
1028 struct kiocb *kiocb;
1033 env = cl_env_get(&refcheck);
1035 RETURN(PTR_ERR(env));
1037 local_iov = &vvp_env_info(env)->vti_local_iov;
1038 kiocb = &vvp_env_info(env)->vti_kiocb;
1039 local_iov->iov_base = (void __user *)buf;
1040 local_iov->iov_len = count;
1041 init_sync_kiocb(kiocb, file);
1042 kiocb->ki_pos = *ppos;
1043 kiocb->ki_left = count;
1045 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the updated position back to the caller's *ppos. */
1046 *ppos = kiocb->ki_pos;
1048 cl_env_put(env, &refcheck);
/*
1054 * Write to a file (through the page cache).
 */
1056 #ifdef HAVE_FILE_WRITEV
/* ->writev() entry point (pre-AIO kernels): validate the iovec, then
 * run a CIT_WRITE through ll_file_io_generic(). */
1057 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1058 unsigned long nr_segs, loff_t *ppos)
1061 struct vvp_io_args *args;
1067 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1071 env = cl_env_get(&refcheck);
1073 RETURN(PTR_ERR(env));
1075 args = vvp_env_args(env, IO_NORMAL);
1076 args->u.normal.via_iov = (struct iovec *)iov;
1077 args->u.normal.via_nrsegs = nr_segs;
1079 result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1080 cl_env_put(env, &refcheck);
/* ->write() entry point (writev-based variant): wrap the user buffer
 * in a single-segment iovec and delegate to ll_file_writev(). */
1084 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1088 struct iovec *local_iov;
1093 env = cl_env_get(&refcheck);
1095 RETURN(PTR_ERR(env));
1097 local_iov = &vvp_env_info(env)->vti_local_iov;
1098 local_iov->iov_base = (void __user *)buf;
1099 local_iov->iov_len = count;
1101 result = ll_file_writev(file, local_iov, 1, ppos);
1102 cl_env_put(env, &refcheck);
1106 #else /* AIO stuff */
/* ->aio_write() entry point: validate the iovec, record the kiocb in
 * the io args and run a CIT_WRITE at iocb->ki_pos. */
1107 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1108 unsigned long nr_segs, loff_t pos)
1111 struct vvp_io_args *args;
1117 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1121 env = cl_env_get(&refcheck);
1123 RETURN(PTR_ERR(env));
1125 args = vvp_env_args(env, IO_NORMAL);
1126 args->u.normal.via_iov = (struct iovec *)iov;
1127 args->u.normal.via_nrsegs = nr_segs;
1128 args->u.normal.via_iocb = iocb;
1130 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1131 &iocb->ki_pos, count);
1132 cl_env_put(env, &refcheck);
/* ->write() entry point (AIO-based variant): build a synchronous kiocb
 * plus single-segment iovec and delegate to ll_file_aio_write(). */
1136 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1140 struct iovec *local_iov;
1141 struct kiocb *kiocb;
1146 env = cl_env_get(&refcheck);
1148 RETURN(PTR_ERR(env));
1150 local_iov = &vvp_env_info(env)->vti_local_iov;
1151 kiocb = &vvp_env_info(env)->vti_kiocb;
1152 local_iov->iov_base = (void __user *)buf;
1153 local_iov->iov_len = count;
1154 init_sync_kiocb(kiocb, file);
1155 kiocb->ki_pos = *ppos;
1156 kiocb->ki_left = count;
1158 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
/* Propagate the updated position back to the caller's *ppos. */
1159 *ppos = kiocb->ki_pos;
1161 cl_env_put(env, &refcheck);
1167 #ifdef HAVE_KERNEL_SENDFILE
/*
1169 * Send file content (through pagecache) somewhere with helper
 * (sendfile actor callback); runs a CIT_READ of IO_SENDFILE subtype.
 */
1171 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1172 read_actor_t actor, void *target)
1175 struct vvp_io_args *args;
1180 env = cl_env_get(&refcheck);
1182 RETURN(PTR_ERR(env));
1184 args = vvp_env_args(env, IO_SENDFILE);
1185 args->u.sendfile.via_target = target;
1186 args->u.sendfile.via_actor = actor;
1188 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1189 cl_env_put(env, &refcheck);
1194 #ifdef HAVE_KERNEL_SPLICE_READ
/*
1196 * Send file content (through pagecache) somewhere with helper
 * (splice to a pipe); runs a CIT_READ of IO_SPLICE subtype.
 */
1198 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1199 struct pipe_inode_info *pipe, size_t count,
1203 struct vvp_io_args *args;
1208 env = cl_env_get(&refcheck);
1210 RETURN(PTR_ERR(env));
1212 args = vvp_env_args(env, IO_SPLICE);
1213 args->u.splice.via_pipe = pipe;
1214 args->u.splice.via_flags = flags;
1216 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1217 cl_env_put(env, &refcheck);
/* Recreate the OST objects for this inode's stripe md on OST @ost_idx
 * with object id/seq @id/@seq: clone the lsm, mark the obdo with
 * OBD_FL_RECREATE_OBJS and call obd_create() under the size lock. */
1222 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_seq seq,
1225 struct obd_export *exp = ll_i2dtexp(inode);
1226 struct obd_trans_info oti = { 0 };
1227 struct obdo *oa = NULL;
1230 struct lov_stripe_md *lsm, *lsm2;
1237 ll_inode_size_lock(inode, 0);
1238 lsm = ll_i2info(inode)->lli_smd;
1240 GOTO(out, rc = -ENOENT);
1241 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1242 (lsm->lsm_stripe_count));
1244 OBD_ALLOC_LARGE(lsm2, lsm_size);
1246 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for object recreation. */
1250 oa->o_nlink = ost_idx;
1251 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1252 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1253 obdo_from_inode(oa, inode, &ll_i2info(inode)->lli_fid, OBD_MD_FLTYPE |
1254 OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1255 memcpy(lsm2, lsm, lsm_size);
1256 rc = obd_create(exp, oa, &lsm2, &oti);
1258 OBD_FREE_LARGE(lsm2, lsm_size);
1261 ll_inode_size_unlock(inode, 0);
/* LL_IOC_RECREATE_OBJ ioctl handler: copy the request from userspace
 * and recreate the object by explicit id/OST index. Requires
 * CAP_SYS_ADMIN. */
1266 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1268 struct ll_recreate_obj ucreat;
1271 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1274 if (cfs_copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1275 sizeof(struct ll_recreate_obj)))
1278 RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
1279 ucreat.lrc_ost_idx));
/* LL_IOC_RECREATE_FID ioctl handler: copy a lu_fid from userspace and
 * derive object id and OST index from its oid/seq bit fields before
 * recreating. Requires CAP_SYS_ADMIN. */
1282 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1289 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1292 if (cfs_copy_from_user(&fid, (struct lu_fid *)arg,
1293 sizeof(struct lu_fid)))
/* NOTE(review): packing encodes the low 16 seq bits above the 32-bit
 * oid, and the next 16 seq bits as the OST index — IGIF-style layout;
 * confirm against the fid packing rules for this release. */
1296 id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
1297 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1298 RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
/* Set striping info for @inode via an intent open carrying @lum.
 * Fails if a stripe md already exists (striping can only be set once).
 * On success the transient open handle is released again. */
1301 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1302 int flags, struct lov_user_md *lum, int lum_size)
1304 struct lov_stripe_md *lsm;
1305 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1309 ll_inode_size_lock(inode, 0);
1310 lsm = ll_i2info(inode)->lli_smd;
/* Striping already set: nothing to do, report and bail out. */
1312 ll_inode_size_unlock(inode, 0);
1313 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1318 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1321 rc = oit.d.lustre.it_status;
1323 GOTO(out_req_free, rc);
1325 ll_release_openhandle(file->f_dentry, &oit);
1328 ll_inode_size_unlock(inode, 0);
1329 ll_intent_release(&oit);
/* Error path: drop the reply request held by the intent. */
1332 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for @filename (a child of directory @inode) from the
 * MDS via md_getattr_name().
 *
 * On success *lmmp points into the reply buffer of *request (caller must
 * keep the request until done with the EA and then release it) and
 * *lmm_size is the EA size.  The EA arrives little-endian from the MDS
 * and is swabbed to host order for big-endian clients.
 */
1336 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1337                              struct lov_mds_md **lmmp, int *lmm_size,
1338                              struct ptlrpc_request **request)
1340         struct ll_sb_info *sbi = ll_i2sbi(inode);
1341         struct mdt_body *body;
1342         struct lov_mds_md *lmm = NULL;
1343         struct ptlrpc_request *req = NULL;
1344         struct md_op_data *op_data;
1347         rc = ll_get_max_mdsize(sbi, &lmmsize);
1351         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1352                                      strlen(filename), lmmsize,
1353                                      LUSTRE_OPC_ANY, NULL);
1354         if (IS_ERR(op_data))
1355                 RETURN(PTR_ERR(op_data));
1357         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1358         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1359         ll_finish_md_op_data(op_data);
1361                 CDEBUG(D_INFO, "md_getattr_name failed "
1362                        "on %s: rc %d\n", filename, rc);
1366         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1367         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1369         lmmsize = body->eadatasize;
1371         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1373                 GOTO(out, rc = -ENODATA);
1376         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1377         LASSERT(lmm != NULL);
        /* Only V1/V3 LOV magics are understood here. */
1379         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1380             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1381                 GOTO(out, rc = -EPROTO);
1385          * This is coming from the MDS, so is probably in
1386          * little endian.  We convert it to host endian before
1387          * passing it to userspace.
        /* Condition is false on little-endian hosts: no swab needed there. */
1389         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1390                 /* if function called for directory - we should
1391                  * avoid swab not existent lsm objects */
1392                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1393                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1394                         if (S_ISREG(body->mode))
1395                                 lustre_swab_lov_user_md_objects(
1396                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1397                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1398                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1399                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1400                         if (S_ISREG(body->mode))
1401                                 lustre_swab_lov_user_md_objects(
1402                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1403                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1409         *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl handler: copy a lov_user_md with one
 * lov_user_ost_data entry from userspace and apply it as the file's
 * striping EA via ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS
 * (objects already exist).  Restricted to CAP_SYS_ADMIN.
 */
1414 static int ll_lov_setea(struct inode *inode, struct file *file,
1417         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1418         struct lov_user_md  *lump;
1419         int lum_size = sizeof(struct lov_user_md) +
1420                        sizeof(struct lov_user_ost_data);
1424         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1427         OBD_ALLOC_LARGE(lump, lum_size);
1431         if (cfs_copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1432                 OBD_FREE_LARGE(lump, lum_size);
1436         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1438         OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler: read a lov_user_md (V1 first,
 * upgraded to V3 if the magic says so) from userspace and set it as the
 * file's striping.  On success the resulting layout is echoed back to
 * userspace through obd_iocontrol(LL_IOC_LOV_GETSTRIPE), with
 * lmm_stripe_count pre-cleared in the user buffer.
 */
1442 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1445         struct lov_user_md_v3 lumv3;
1446         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1447         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1448         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1451         int flags = FMODE_WRITE;
1454         /* first try with v1 which is smaller than v3 */
1455         lum_size = sizeof(struct lov_user_md_v1);
1456         if (cfs_copy_from_user(lumv1, lumv1p, lum_size))
1459         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1460                 lum_size = sizeof(struct lov_user_md_v3);
1461                 if (cfs_copy_from_user(&lumv3, lumv3p, lum_size))
1465         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1467                 put_user(0, &lumv1p->lmm_stripe_count);
1468                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1469                                    0, ll_i2info(inode)->lli_smd,
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: pass the inode's stripe metadata
 * down to obd_iocontrol() so the LOV layer copies the layout to @arg.
 * NOTE(review): elided view; the no-stripe error path is not visible.
 */
1475 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1477         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1482         rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
/*
 * LL_IOC_GROUP_LOCK handler: take a cl-layer group lock with gid @arg
 * on behalf of this file descriptor and record it in the fd.
 *
 * lli_lock protects fd_flags/fd_grouplock; it is dropped around the
 * blocking cl_get_grouplock() call and the "group lock already taken"
 * race is re-checked afterwards.  Only one group lock per fd is allowed.
 */
1487 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1489         struct ll_inode_info   *lli = ll_i2info(inode);
1490         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1491         struct ccc_grouplock    grouplock;
1495         if (ll_file_nolock(file))
1496                 RETURN(-EOPNOTSUPP);
1498         cfs_spin_lock(&lli->lli_lock);
1499         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1500                 CWARN("group lock already existed with gid %lu\n",
1501                       fd->fd_grouplock.cg_gid);
1502                 cfs_spin_unlock(&lli->lli_lock);
1505         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1506         cfs_spin_unlock(&lli->lli_lock);
        /* May block (unless O_NONBLOCK); cannot hold the spinlock here. */
1508         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1509                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1513         cfs_spin_lock(&lli->lli_lock);
1514         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1515                 cfs_spin_unlock(&lli->lli_lock);
1516                 CERROR("another thread just won the race\n");
1517                 cl_put_grouplock(&grouplock);
1521         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1522         fd->fd_grouplock = grouplock;
1523         cfs_spin_unlock(&lli->lli_lock);
1525         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid @arg
 * held by this file descriptor.  Errors if no group lock is held or the
 * gid does not match.  The fd's record is cleared under lli_lock, then
 * the cl-layer lock is dropped outside the spinlock.
 */
1529 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1531         struct ll_inode_info   *lli = ll_i2info(inode);
1532         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1533         struct ccc_grouplock    grouplock;
1536         cfs_spin_lock(&lli->lli_lock);
1537         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1538                 cfs_spin_unlock(&lli->lli_lock);
1539                 CWARN("no group lock held\n");
1542         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1544         if (fd->fd_grouplock.cg_gid != arg) {
1545                 CWARN("group lock %lu doesn't match current id %lu\n",
1546                       arg, fd->fd_grouplock.cg_gid);
1547                 cfs_spin_unlock(&lli->lli_lock);
1551         grouplock = fd->fd_grouplock;
1552         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1553         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1554         cfs_spin_unlock(&lli->lli_lock);
1556         cl_put_grouplock(&grouplock);
1557         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1562  * Close inode open handle
1564  * \param dentry [in]     dentry which contains the inode
1565  * \param it     [in,out] intent which contains open info and result
1568  * \retval <0 failure
1570 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1572         struct inode *inode = dentry->d_inode;
1573         struct obd_client_handle *och;
1579         /* Root ?  Do nothing. */
1580         if (dentry->d_inode->i_sb->s_root == dentry)
1583         /* No open handle to close? Move away */
1584         if (!it_disposition(it, DISP_OPEN_OPEN))
1587         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1589         OBD_ALLOC(och, sizeof(*och));
1591                 GOTO(out, rc = -ENOMEM);
        /* Fill the handle from the intent, then close it on the MDS. */
1593         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1594                     ll_i2info(inode), it, och);
1596         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1599         /* this one is in place of ll_file_open */
1600         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1601                 ptlrpc_req_finished(it->d.lustre.it_data);
1602                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1608  * Get size for inode for which FIEMAP mapping is requested.
1609  * Make the FIEMAP get_info call and returns the result.
1611 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1614         struct obd_export *exp = ll_i2dtexp(inode);
1615         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1616         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1617         int vallen = num_bytes;
1621         /* Checks for fiemap flags */
1622         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
                /* Report the unsupported flags back to the caller. */
1623                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1627         /* Check for FIEMAP_FLAG_SYNC */
1628         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1629                 rc = filemap_fdatawrite(inode->i_mapping);
1634         /* If the stripe_count > 1 and the application does not understand
1635          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1637         if (lsm->lsm_stripe_count > 1 &&
1638             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1641         fm_key.oa.o_id = lsm->lsm_object_id;
1642         fm_key.oa.o_seq = lsm->lsm_object_seq;
1643         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1645         obdo_from_inode(&fm_key.oa, inode, &ll_i2info(inode)->lli_fid,
1647         /* If filesize is 0, then there would be no objects for mapping */
1648         if (fm_key.oa.o_size == 0) {
1649                 fiemap->fm_mapped_extents = 0;
1653         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
        /* The LOV layer fills @fiemap (up to num_bytes) via get_info. */
1655         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1657                 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * OBD_IOC_FID2PATH handler: read a getinfo_fid2path request from
 * userspace, allocate an output buffer sized by the caller-supplied
 * gf_pathlen, resolve the fid to a path through the MDC, and copy the
 * result back.
 * NOTE(review): elided view; gfin free and error returns not all visible.
 */
1662 int ll_fid2path(struct obd_export *exp, void *arg)
1664         struct getinfo_fid2path *gfout, *gfin;
1668         /* Need to get the buflen */
1669         OBD_ALLOC_PTR(gfin);
1672         if (cfs_copy_from_user(gfin, arg, sizeof(*gfin))) {
1677         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1678         OBD_ALLOC(gfout, outsize);
1679         if (gfout == NULL) {
1683         memcpy(gfout, gfin, sizeof(*gfout));
1686         /* Call mdc_iocontrol */
1687         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1690         if (cfs_copy_to_user(arg, gfout, outsize))
1694         OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP ioctl handler: size a kernel fiemap buffer from the
 * user's fm_extent_count, copy the request (and first extent, used to
 * continue a previous mapping) in, run ll_do_fiemap(), and copy the
 * header plus mapped extents back to userspace.
 */
1698 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1700         struct ll_user_fiemap *fiemap_s;
1701         size_t num_bytes, ret_bytes;
1702         unsigned int extent_count;
1705         /* Get the extent count so we can calculate the size of
1706          * required fiemap buffer */
1707         if (get_user(extent_count,
1708             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1710         num_bytes = sizeof(*fiemap_s) + (extent_count *
1711                                          sizeof(struct ll_fiemap_extent));
1713         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1714         if (fiemap_s == NULL)
1717         /* get the fiemap value */
1718         if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1720                 GOTO(error, rc = -EFAULT);
1722         /* If fm_extent_count is non-zero, read the first extent since
1723          * it is used to calculate end_offset and device from previous
1726                 if (copy_from_user(&fiemap_s->fm_extents[0],
1727                     (char __user *)arg + sizeof(*fiemap_s),
1728                     sizeof(struct ll_fiemap_extent)))
1729                         GOTO(error, rc = -EFAULT);
1732         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
        /* Header always goes back; extents only if the caller asked. */
1736         ret_bytes = sizeof(struct ll_user_fiemap);
1738         if (extent_count != 0)
1739                 ret_bytes += (fiemap_s->fm_mapped_extents *
1740                                  sizeof(struct ll_fiemap_extent));
1742         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1746         OBD_FREE_LARGE(fiemap_s, num_bytes);
1751  * Read the data_version for inode.
1753  * This value is computed using stripe object version on OST.
1754  * Version is computed using server side locking.
1756  * @param extent_lock  Take extent lock. Not needed if a process is already
1757  *                     holding the OST object group locks.
1759 static int ll_data_version(struct inode *inode, __u64 *data_version,
1762         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1763         struct ll_sb_info    *sbi = ll_i2sbi(inode);
1764         struct obdo          *obdo = NULL;
1768         /* If no stripe, we consider version is 0. */
1771                 CDEBUG(D_INODE, "No object for inode\n");
1775         OBD_ALLOC_PTR(obdo);
        /* getattr on the objects fills o_data_version when the OST sends it */
1779         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1781                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1784                         *data_version = obdo->o_data_version;
1792 #ifdef HAVE_UNLOCKED_IOCTL
/*
 * Main ioctl entry point for regular files: dispatches Lustre-specific
 * commands (striping, grouplocks, fiemap, fid2path, data version, ...).
 * Unknown commands fall through to the dynamic ioctl registry and then
 * to obd_iocontrol() on the data export.
 * Two signatures: unlocked_ioctl vs. legacy ioctl, chosen at build time.
 */
1793 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1795         struct inode *inode = file->f_dentry->d_inode;
1797 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1801         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1806         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1807                inode->i_generation, inode, cmd);
1808         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1810         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1811         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1815         case LL_IOC_GETFLAGS:
1816                 /* Get the current value of the file flags */
1817                 return put_user(fd->fd_flags, (int *)arg);
1818         case LL_IOC_SETFLAGS:
1819         case LL_IOC_CLRFLAGS:
1820                 /* Set or clear specific file flags */
1821                 /* XXX This probably needs checks to ensure the flags are
1822                  * not abused, and to handle any flag side effects.
1824                 if (get_user(flags, (int *) arg))
1827                 if (cmd == LL_IOC_SETFLAGS) {
                        /* LL_FILE_IGNORE_LOCK is only safe with O_DIRECT */
1828                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1829                             !(file->f_flags & O_DIRECT)) {
1830                                 CERROR("%s: unable to disable locking on "
1831                                        "non-O_DIRECT file\n", current->comm);
1835                         fd->fd_flags |= flags;
1837                         fd->fd_flags &= ~flags;
1840         case LL_IOC_LOV_SETSTRIPE:
1841                 RETURN(ll_lov_setstripe(inode, file, arg));
1842         case LL_IOC_LOV_SETEA:
1843                 RETURN(ll_lov_setea(inode, file, arg));
1844         case LL_IOC_LOV_GETSTRIPE:
1845                 RETURN(ll_lov_getstripe(inode, arg));
1846         case LL_IOC_RECREATE_OBJ:
1847                 RETURN(ll_lov_recreate_obj(inode, arg));
1848         case LL_IOC_RECREATE_FID:
1849                 RETURN(ll_lov_recreate_fid(inode, arg));
1850         case FSFILT_IOC_FIEMAP:
1851                 RETURN(ll_ioctl_fiemap(inode, arg));
1852         case FSFILT_IOC_GETFLAGS:
1853         case FSFILT_IOC_SETFLAGS:
1854                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1855         case FSFILT_IOC_GETVERSION_OLD:
1856         case FSFILT_IOC_GETVERSION:
1857                 RETURN(put_user(inode->i_generation, (int *)arg));
1858         case LL_IOC_GROUP_LOCK:
1859                 RETURN(ll_get_grouplock(inode, file, arg));
1860         case LL_IOC_GROUP_UNLOCK:
1861                 RETURN(ll_put_grouplock(inode, file, arg));
1862         case IOC_OBD_STATFS:
1863                 RETURN(ll_obd_statfs(inode, (void *)arg));
1865         /* We need to special case any other ioctls we want to handle,
1866          * to send them to the MDS/OST as appropriate and to properly
1867          * network encode the arg field.
1868         case FSFILT_IOC_SETVERSION_OLD:
1869         case FSFILT_IOC_SETVERSION:
1871         case LL_IOC_FLUSHCTX:
1872                 RETURN(ll_flush_ctx(inode));
1873         case LL_IOC_PATH2FID: {
1874                 if (cfs_copy_to_user((void *)arg, ll_inode2fid(inode),
1875                                      sizeof(struct lu_fid)))
1880         case OBD_IOC_FID2PATH:
1881                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1882         case LL_IOC_DATA_VERSION: {
1883                 struct ioc_data_version idv;
1886                 if (cfs_copy_from_user(&idv, (char *)arg, sizeof(idv)))
1889                 rc = ll_data_version(inode, &idv.idv_version,
1890                                 !(idv.idv_flags & LL_DV_NOFLUSH));
1893                     cfs_copy_to_user((char *) arg, &idv, sizeof(idv)))
1899         case LL_IOC_GET_MDTIDX: {
1902                 mdtidx = ll_get_mdt_idx(inode);
1906                 if (put_user((int)mdtidx, (int*)arg))
1911         case OBD_IOC_GETDTNAME:
1912         case OBD_IOC_GETMDNAME:
1913                 RETURN(ll_get_obd_name(inode, cmd, arg));
                /* Dynamic registry first, then the data export as fallback. */
1918                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1921                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek implementation.  For SEEK_END a glimpse RPC refreshes the file
 * size from the OSTs before the offset is computed.  Offsets are bounded
 * by ll_file_maxbytes(inode).
 * NOTE(review): elided view; locking and the version/eof bookkeeping on
 * f_pos update are not fully visible.
 */
1927 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1929         struct inode *inode = file->f_dentry->d_inode;
1932         retval = offset + ((origin == 2) ? i_size_read(inode) :
1933                            (origin == 1) ? file->f_pos : 0);
1934         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%s)\n",
1935                inode->i_ino, inode->i_generation, inode, retval, retval,
1936                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1937         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1939         if (origin == 2) { /* SEEK_END */
1942                 rc = ll_glimpse_size(inode);
1946                 offset += i_size_read(inode);
1947         } else if (origin == 1) { /* SEEK_CUR */
1948                 offset += file->f_pos;
1952         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1953                 if (offset != file->f_pos) {
1954                         file->f_pos = offset;
1962 #ifdef HAVE_FLUSH_OWNER_ID
/*
 * .flush handler (called on close(2)): report any write error recorded
 * for this inode.  Collects lli_write_rc, the saved async writeback
 * error (lli_async_rc, cleared once read) and per-stripe async errors
 * from the LOV; any of them turns the flush into -EIO.
 */
1963 int ll_flush(struct file *file, fl_owner_t id)
1965 int ll_flush(struct file *file)
1968         struct inode *inode = file->f_dentry->d_inode;
1969         struct ll_inode_info *lli = ll_i2info(inode);
1970         struct lov_stripe_md *lsm = lli->lli_smd;
1973         LASSERT(!S_ISDIR(inode->i_mode));
1975         /* the application should know write failure already. */
1976         if (lli->lli_write_rc)
1979         /* catch async errors that were recorded back when async writeback
1980          * failed for pages in this mapping. */
1981         rc = lli->lli_async_rc;
1982         lli->lli_async_rc = 0;
1984                 err = lov_test_and_clear_async_rc(lsm);
1989         return rc ? -EIO : 0;
1992 #ifndef HAVE_FILE_FSYNC_2ARGS
/*
 * fsync(2) implementation: wait for dirty pages, collect recorded async
 * writeback errors, md_sync() the metadata on the MDS, then (for files
 * with objects) obd_sync_rqset() the data on the OSTs with the proper
 * OSS capability.  The final result is cached in lli_write_rc so a later
 * flush/close can report it.
 * Two signatures depending on the kernel's fsync prototype.
 */
1993 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1995 int ll_fsync(struct file *file, int data)
1998         struct inode *inode = file->f_dentry->d_inode;
1999         struct ll_inode_info *lli = ll_i2info(inode);
2000         struct lov_stripe_md *lsm = lli->lli_smd;
2001         struct ptlrpc_request *req;
2002         struct obd_capa *oc;
2005         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2006                inode->i_generation, inode);
2007         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2009         /* fsync's caller has already called _fdata{sync,write}, we want
2010          * that IO to finish before calling the osc and mdc sync methods */
2011         rc = filemap_fdatawait(inode->i_mapping);
2013         /* catch async errors that were recorded back when async writeback
2014          * failed for pages in this mapping. */
2015         if (!S_ISDIR(inode->i_mode)) {
2016                 err = lli->lli_async_rc;
2017                 lli->lli_async_rc = 0;
2021                         err = lov_test_and_clear_async_rc(lsm);
2027         oc = ll_mdscapa_get(inode);
2028         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2034                 ptlrpc_req_finished(req);
2037                 struct obd_info *oinfo;
2039                 OBD_ALLOC_PTR(oinfo);
2041                         RETURN(rc ? rc : -ENOMEM);
2042                 OBDO_ALLOC(oinfo->oi_oa);
2043                 if (!oinfo->oi_oa) {
2044                         OBD_FREE_PTR(oinfo);
2045                         RETURN(rc ? rc : -ENOMEM);
2047                 oinfo->oi_oa->o_id = lsm->lsm_object_id;
2048                 oinfo->oi_oa->o_seq = lsm->lsm_object_seq;
2049                 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2050                 obdo_from_inode(oinfo->oi_oa, inode, &ll_i2info(inode)->lli_fid,
2051                                 OBD_MD_FLTYPE | OBD_MD_FLATIME |
2052                                 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2055                 oinfo->oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2056                 err = obd_sync_rqset(ll_i2sbi(inode)->ll_dt_exp, oinfo, 0,
2058                 capa_put(oinfo->oi_capa);
2061                 OBDO_FREE(oinfo->oi_oa);
2062                 OBD_FREE_PTR(oinfo);
                /* remember the outcome for later flush()/close() */
2063                 lli->lli_write_rc = rc < 0 ? rc : 0;
/*
 * flock(2)/fcntl(2) lock handler: translate the kernel file_lock into an
 * LDLM flock enqueue on the MDS.  BSD flocks are whole-file and keyed by
 * the struct file pointer; POSIX locks carry start/end and are keyed by
 * fl_owner.  F_UNLCK is sent as an LCK_NL enqueue rather than a cancel
 * (see comment below).  On success the lock is also registered with the
 * local VFS so lock bookkeeping and blocking work.
 */
2069 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2071         struct inode *inode = file->f_dentry->d_inode;
2072         struct ll_sb_info *sbi = ll_i2sbi(inode);
2073         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2074                                            .ei_cb_cp =ldlm_flock_completion_ast,
2075                                            .ei_cbdata = file_lock };
2076         struct md_op_data *op_data;
2077         struct lustre_handle lockh = {0};
2078         ldlm_policy_data_t flock = {{0}};
2083         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2084                inode->i_ino, file_lock);
2086         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2088         if (file_lock->fl_flags & FL_FLOCK) {
2089                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2090                 /* flocks are whole-file locks */
2091                 flock.l_flock.end = OFFSET_MAX;
2092                 /* For flocks owner is determined by the local file desctiptor*/
2093                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2094         } else if (file_lock->fl_flags & FL_POSIX) {
2095                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2096                 flock.l_flock.start = file_lock->fl_start;
2097                 flock.l_flock.end = file_lock->fl_end;
2101         flock.l_flock.pid = file_lock->fl_pid;
2103         /* Somewhat ugly workaround for svc lockd.
2104          * lockd installs custom fl_lmops->fl_compare_owner that checks
2105          * for the fl_owner to be the same (which it always is on local node
2106          * I guess between lockd processes) and then compares pid.
2107          * As such we assign pid to the owner field to make it all work,
2108          * conflict with normal locks is unlikely since pid space and
2109          * pointer space for current->files are not intersecting */
2110         if (file_lock->fl_lmops && file_lock->fl_lmops->fl_compare_owner)
2111                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
        /* Map fcntl lock type to an LDLM mode: read->PR, write->PW, unlock->NL */
2113         switch (file_lock->fl_type) {
2115                 einfo.ei_mode = LCK_PR;
2118                 /* An unlock request may or may not have any relation to
2119                  * existing locks so we may not be able to pass a lock handle
2120                  * via a normal ldlm_lock_cancel() request. The request may even
2121                  * unlock a byte range in the middle of an existing lock. In
2122                  * order to process an unlock request we need all of the same
2123                  * information that is given with a normal read or write record
2124                  * lock request. To avoid creating another ldlm unlock (cancel)
2125                  * message we'll treat a LCK_NL flock request as an unlock. */
2126                 einfo.ei_mode = LCK_NL;
2129                 einfo.ei_mode = LCK_PW;
2132                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2133                        file_lock->fl_type);
2148                 flags = LDLM_FL_BLOCK_NOWAIT;
2154                 flags = LDLM_FL_TEST_LOCK;
2155                 /* Save the old mode so that if the mode in the lock changes we
2156                  * can decrement the appropriate reader or writer refcount. */
2157                 file_lock->fl_type = einfo.ei_mode;
2160                 CERROR("unknown fcntl lock command: %d\n", cmd);
2164         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2165                                      LUSTRE_OPC_ANY, NULL);
2166         if (IS_ERR(op_data))
2167                 RETURN(PTR_ERR(op_data));
2169         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2170                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2171                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2173         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2174                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2176         ll_finish_md_op_data(op_data);
        /* Mirror the server-side result into the local VFS lock lists. */
2178         if ((file_lock->fl_flags & FL_FLOCK) &&
2179             (rc == 0 || file_lock->fl_type == F_UNLCK))
2180                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2181 #ifdef HAVE_F_OP_FLOCK
2182         if ((file_lock->fl_flags & FL_POSIX) &&
2183             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2184             !(flags & LDLM_FL_TEST_LOCK))
2185                 posix_lock_file_wait(file, file_lock);
/*
 * Lock handler used by the -o noflock file_operations tables; body is
 * elided in this view — per the table comment below it rejects flock
 * calls (ENOSYS).  NOTE(review): confirm against full source.
 */
2191 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2199  * test if some locks matching bits and l_req_mode are acquired
2200  * - bits can be in different locks
2201  * - if found clear the common lock bits in *bits
2202  * - the bits not found, are kept in *bits
2204  * \param bits [IN] searched lock bits [IN]
2205  * \param l_req_mode [IN] searched lock mode
2206  * \retval boolean, true iff all bits are found
2208 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2210         struct lustre_handle lockh;
2211         ldlm_policy_data_t policy;
        /* LCK_MINMODE means "any mode": match against all four modes. */
2212         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2213                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2222         fid = &ll_i2info(inode)->lli_fid;
2223         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2224                ldlm_lockname[mode]);
2226         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2227         for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2228                 policy.l_inodebits.bits = *bits & (1 << i);
2229                 if (policy.l_inodebits.bits == 0)
2232                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2233                                   &policy, mode, &lockh)) {
2234                         struct ldlm_lock *lock;
2236                         lock = ldlm_handle2lock(&lockh);
                        /* clear every bit the matched lock also grants */
2239                                         ~(lock->l_policy_data.l_inodebits.bits);
2240                                 LDLM_LOCK_PUT(lock);
2242                                 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MD lock on @inode
 * covering all of inode bits @bits, in any of CR/CW/PR/PW modes.
 * Returns the matched mode with the handle in *lockh; 0 if none.
 */
2249 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2250                             struct lustre_handle *lockh)
2252         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2258         fid = &ll_i2info(inode)->lli_fid;
2259         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2261         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2262         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2263                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Post-process a revalidate RPC result: -ENOENT on an already-unlinked
 * inode is downgraded to success (after updating nlink); other errors
 * are logged and propagated.
 */
2267 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2268         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2269                               * and return success */
2271                 /* This path cannot be hit for regular files unless in
2272                  * case of obscure races, so no need to to validate
2274                 if (!S_ISREG(inode->i_mode) &&
2275                     !S_ISDIR(inode->i_mode))
2280                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate @dentry's inode attributes against the MDS for the lock
 * bits in @ibits.  Two strategies:
 *  - with OBD_CONNECT_ATTRFID: do a by-fid IT_GETATTR/IT_LOOKUP intent
 *    (no name needed), then finish/refresh the dentry from the reply;
 *  - otherwise, if no matching MD lock is cached, issue a plain
 *    md_getattr() (adding EA sizing for regular files) and re-prime the
 *    inode with ll_prep_inode().
 * NOTE(review): elided view; several branches/returns are not visible.
 */
2288 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2291         struct inode *inode = dentry->d_inode;
2292         struct ptlrpc_request *req = NULL;
2293         struct obd_export *exp;
2298                 CERROR("REPORT THIS LINE TO PETER\n");
2302         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2303                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2305         exp = ll_i2mdexp(inode);
2307         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2308          *      But under CMD case, it caused some lock issues, should be fixed
2309          *      with new CMD ibits lock. See bug 12718 */
2310         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2311                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2312                 struct md_op_data *op_data;
2314                 if (ibits == MDS_INODELOCK_LOOKUP)
2315                         oit.it_op = IT_LOOKUP;
2317                 /* Call getattr by fid, so do not provide name at all. */
2318                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2319                                              dentry->d_inode, NULL, 0, 0,
2320                                              LUSTRE_OPC_ANY, NULL);
2321                 if (IS_ERR(op_data))
2322                         RETURN(PTR_ERR(op_data));
2324                 oit.it_create_mode |= M_CHECK_STALE;
2325                 rc = md_intent_lock(exp, op_data, NULL, 0,
2326                                     /* we are not interested in name
2329                                     ll_md_blocking_ast, 0);
2330                 ll_finish_md_op_data(op_data);
2331                 oit.it_create_mode &= ~M_CHECK_STALE;
2333                         rc = ll_inode_revalidate_fini(inode, rc);
2337                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2339                         ll_intent_release(&oit);
2343                 /* Unlinked? Unhash dentry, so it is not picked up later by
2344                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2345                    here to preserve get_cwd functionality on 2.6.
2347                 if (!dentry->d_inode->i_nlink) {
2348                         cfs_spin_lock(&ll_lookup_lock);
2349                         spin_lock(&dcache_lock);
2350                         ll_drop_dentry(dentry);
2351                         spin_unlock(&dcache_lock);
2352                         cfs_spin_unlock(&ll_lookup_lock);
2355                 ll_lookup_finish_locks(&oit, dentry);
2356         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2357                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2358                 obd_valid valid = OBD_MD_FLGETATTR;
2359                 struct md_op_data *op_data;
                /* Regular files also need the striping EA: size the reply. */
2362                 if (S_ISREG(inode->i_mode)) {
2363                         rc = ll_get_max_mdsize(sbi, &ealen);
2366                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2369                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2370                                              0, ealen, LUSTRE_OPC_ANY,
2372                 if (IS_ERR(op_data))
2373                         RETURN(PTR_ERR(op_data));
2375                 op_data->op_valid = valid;
2376                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2377                  * capa for this inode. Because we only keep capas of dirs
2379                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2380                 ll_finish_md_op_data(op_data);
2382                         rc = ll_inode_revalidate_fini(inode, rc);
2386                 rc = ll_prep_inode(&inode, req, NULL);
2389         ptlrpc_req_finished(req);
/*
 * Revalidate attributes and then size: after the MDS revalidate, either
 * take times from the cached lvb (file has no objects yet, so size
 * cannot be validated) or issue a glimpse to refresh the size from the
 * OSTs.
 */
2393 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2396         struct inode *inode = dentry->d_inode;
2400         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2402         /* if object not yet allocated, don't validate size */
2403         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL) {
2404                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2405                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2406                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2410         /* ll_glimpse_size will prefer locally cached writes if they extend
2414         rc = ll_glimpse_size(inode);
/*
 * getattr with an explicit intent: revalidate UPDATE|LOOKUP bits on the
 * MDS, then fill *stat from the (now fresh) inode.  Under 32-bit API
 * mode the ino is derived from the fid instead of i_ino.
 */
2419 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2420                   struct lookup_intent *it, struct kstat *stat)
2422         struct inode *inode = de->d_inode;
2423         struct ll_sb_info *sbi = ll_i2sbi(inode);
2424         struct ll_inode_info *lli = ll_i2info(inode);
2427         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2428                                              MDS_INODELOCK_LOOKUP);
2429         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2434         stat->dev = inode->i_sb->s_dev;
2435         if (ll_need_32bit_api(sbi))
2436                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2438                 stat->ino = inode->i_ino;
2439         stat->mode = inode->i_mode;
2440         stat->nlink = inode->i_nlink;
2441         stat->uid = inode->i_uid;
2442         stat->gid = inode->i_gid;
2443         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2444         stat->atime = inode->i_atime;
2445         stat->mtime = inode->i_mtime;
2446         stat->ctime = inode->i_ctime;
2447 #ifdef HAVE_INODE_BLKSIZE
2448         stat->blksize = inode->i_blksize;
2450         stat->blksize = 1 << inode->i_blkbits;
2453         stat->size = i_size_read(inode);
2454         stat->blocks = inode->i_blocks;
/* VFS .getattr entry point: same as ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2458 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2460         struct lookup_intent it = { .it_op = IT_GETATTR };
2462         return ll_getattr_it(mnt, de, &it, stat);
2465 #ifdef HAVE_LINUX_FIEMAP_H
/*
 * VFS ->fiemap entry point: marshal the kernel's fiemap_extent_info
 * into a ll_user_fiemap buffer (including the first extent, used for
 * continuation), run ll_do_fiemap(), and copy flags/extent results
 * back into fieinfo.
 */
2466 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2467               __u64 start, __u64 len)
2471         struct ll_user_fiemap *fiemap;
2472         unsigned int extent_count = fieinfo->fi_extents_max;
2474         num_bytes = sizeof(*fiemap) + (extent_count *
2475                                        sizeof(struct ll_fiemap_extent));
2476         OBD_ALLOC_LARGE(fiemap, num_bytes);
2481         fiemap->fm_flags = fieinfo->fi_flags;
2482         fiemap->fm_extent_count = fieinfo->fi_extents_max;
2483         fiemap->fm_start = start;
2484         fiemap->fm_length = len;
2485         memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2486                sizeof(struct ll_fiemap_extent));
2488         rc = ll_do_fiemap(inode, fiemap, num_bytes);
2490         fieinfo->fi_flags = fiemap->fm_flags;
2491         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2492         memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2493                fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2495         OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ACL check callback for generic_permission(): duplicate the cached
 * POSIX ACL under lli_lock and evaluate it for @mask.  Compiled out
 * when CONFIG_FS_POSIX_ACL is off; under RCU-walk permission calls it
 * bails out (cannot block).
 */
2502 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2503 lustre_check_acl(struct inode *inode, int mask, unsigned int flags)
2505 lustre_check_acl(struct inode *inode, int mask)
2508 #ifdef CONFIG_FS_POSIX_ACL
2509         struct ll_inode_info *lli = ll_i2info(inode);
2510         struct posix_acl *acl;
2514 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2515         if (flags & IPERM_FLAG_RCU)
2518         cfs_spin_lock(&lli->lli_lock);
2519         acl = posix_acl_dup(lli->lli_posix_acl);
2520         cfs_spin_unlock(&lli->lli_lock);
2525         rc = posix_acl_permission(inode, acl, mask);
2526         posix_acl_release(acl);
/*
 * VFS .permission entry point (signature varies by kernel version).
 * The root inode is revalidated first since lookup never validates it;
 * remote clients go through lustre_check_remote_perm(), everyone else
 * through generic permission checking with the Lustre ACL callback.
 */
2534 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
2535 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
2537 # ifdef HAVE_INODE_PERMISION_2ARGS
2538 int ll_inode_permission(struct inode *inode, int mask)
2540 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2547        /* as root inode are NOT getting validated in lookup operation,
2548         * need to do it before permission check. */
2550         if (inode == inode->i_sb->s_root->d_inode) {
2551                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2553                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2554                                               MDS_INODELOCK_LOOKUP);
2559         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2560                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2562         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2563                 return lustre_check_remote_perm(inode, mask);
2565         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2566         rc = ll_generic_permission(inode, mask, flags, lustre_check_acl);
/* Select the vectored read/write method names and implementations for
 * the file_operations tables below, depending on whether this kernel
 * uses readv/writev or aio_read/aio_write. */
2571 #ifdef HAVE_FILE_READV
2572 #define READ_METHOD readv
2573 #define READ_FUNCTION ll_file_readv
2574 #define WRITE_METHOD writev
2575 #define WRITE_FUNCTION ll_file_writev
2577 #define READ_METHOD aio_read
2578 #define READ_FUNCTION ll_file_aio_read
2579 #define WRITE_METHOD aio_write
2580 #define WRITE_FUNCTION ll_file_aio_write
2583 /* -o localflock - only provides locally consistent flock locks */
2584 struct file_operations ll_file_operations = {
2585         .read           = ll_file_read,
2586         .READ_METHOD    = READ_FUNCTION,
2587         .write          = ll_file_write,
2588         .WRITE_METHOD   = WRITE_FUNCTION,
2589 #ifdef HAVE_UNLOCKED_IOCTL
2590         .unlocked_ioctl = ll_file_ioctl,
2592         .ioctl          = ll_file_ioctl,
2594         .open           = ll_file_open,
2595         .release        = ll_file_release,
2596         .mmap           = ll_file_mmap,
2597         .llseek         = ll_file_seek,
2598 #ifdef HAVE_KERNEL_SENDFILE
2599         .sendfile       = ll_file_sendfile,
2601 #ifdef HAVE_KERNEL_SPLICE_READ
2602         .splice_read    = ll_file_splice_read,
/* file_operations used with cluster-coherent flock support (-o flock):
 * identical to ll_file_operations plus .flock/.lock wired to
 * ll_file_flock. */
2608 struct file_operations ll_file_operations_flock = {
2609         .read           = ll_file_read,
2610         .READ_METHOD    = READ_FUNCTION,
2611         .write          = ll_file_write,
2612         .WRITE_METHOD   = WRITE_FUNCTION,
2613 #ifdef HAVE_UNLOCKED_IOCTL
2614         .unlocked_ioctl = ll_file_ioctl,
2616         .ioctl          = ll_file_ioctl,
2618         .open           = ll_file_open,
2619         .release        = ll_file_release,
2620         .mmap           = ll_file_mmap,
2621         .llseek         = ll_file_seek,
2622 #ifdef HAVE_KERNEL_SENDFILE
2623         .sendfile       = ll_file_sendfile,
2625 #ifdef HAVE_KERNEL_SPLICE_READ
2626         .splice_read    = ll_file_splice_read,
2630 #ifdef HAVE_F_OP_FLOCK
2631         .flock          = ll_file_flock,
2633         .lock           = ll_file_flock
2636 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock mounts: ->flock/->lock point at
 * ll_file_noflock, which rejects flock requests outright.
 * NOTE(review): #else/#endif lines and the closing "};" are missing from
 * this extraction, same as the two tables above. */
2637 struct file_operations ll_file_operations_noflock = {
2638 .read = ll_file_read,
2639 .READ_METHOD = READ_FUNCTION,
2640 .write = ll_file_write,
2641 .WRITE_METHOD = WRITE_FUNCTION,
2642 #ifdef HAVE_UNLOCKED_IOCTL
2643 .unlocked_ioctl = ll_file_ioctl,
2645 .ioctl = ll_file_ioctl,
2647 .open = ll_file_open,
2648 .release = ll_file_release,
2649 .mmap = ll_file_mmap,
2650 .llseek = ll_file_seek,
2651 #ifdef HAVE_KERNEL_SENDFILE
2652 .sendfile = ll_file_sendfile,
2654 #ifdef HAVE_KERNEL_SPLICE_READ
2655 .splice_read = ll_file_splice_read,
2659 #ifdef HAVE_F_OP_FLOCK
2660 .flock = ll_file_noflock,
2662 .lock = ll_file_noflock
/* inode_operations for regular files: attribute get/set, permission check
 * and xattr handlers, plus FIEMAP extent mapping where the kernel has it.
 * NOTE(review): the #endif for HAVE_LINUX_FIEMAP_H and the closing "};"
 * are missing from this extraction. */
2665 struct inode_operations ll_file_inode_operations = {
2666 .setattr = ll_setattr,
2667 .truncate = ll_truncate,
2668 .getattr = ll_getattr,
2669 .permission = ll_inode_permission,
2670 .setxattr = ll_setxattr,
2671 .getxattr = ll_getxattr,
2672 .listxattr = ll_listxattr,
2673 .removexattr = ll_removexattr,
2674 #ifdef HAVE_LINUX_FIEMAP_H
2675 .fiemap = ll_fiemap,
2679 /* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a statically
 * initialized rw-semaphore-protected list head (llioc), followed by the
 * per-registration record type (llioc_data) whose flexible trailing array
 * holds the command numbers it claims.
 * NOTE(review): several declaration lines are missing from this extraction
 * — presumably "} llioc = {" between the members and the initializers,
 * the closing "};", and the "struct llioc_data {" opener before
 * iocd_list; restore them before building. */
2680 static struct llioc_ctl_data {
2681 cfs_rw_semaphore_t ioc_sem;
2682 cfs_list_t ioc_head;
2684 __RWSEM_INITIALIZER(llioc.ioc_sem),
2685 CFS_LIST_HEAD_INIT(llioc.ioc_head)
2690 cfs_list_t iocd_list;
2691 unsigned int iocd_size;
2692 llioc_callback_t iocd_cb;
2693 unsigned int iocd_count;
2694 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: @cb is invoked for any of the @count
 * command numbers in @cmd.  Returns an opaque magic cookie (presumably the
 * allocated record — TODO confirm, the return statements are missing from
 * this extraction) to pass to ll_iocontrol_unregister(), or NULL on error.
 * NOTE(review): the opening/closing braces, the error-path returns after
 * the parameter check and allocation failure, and the declaration of
 * `size` are missing from this chunk.
 */
2697 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2700 struct llioc_data *in_data = NULL;
2703 if (cb == NULL || cmd == NULL ||
2704 count > LLIOC_MAX_CMD || count < 0)
2707 size = sizeof(*in_data) + count * sizeof(unsigned int);
2708 OBD_ALLOC(in_data, size);
2709 if (in_data == NULL)
2712 memset(in_data, 0, sizeof(*in_data));
2713 in_data->iocd_size = size;
2714 in_data->iocd_cb = cb;
2715 in_data->iocd_count = count;
2716 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
     /* Publish the record on the global registry under the write lock. */
2718 cfs_down_write(&llioc.ioc_sem);
2719 cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2720 cfs_up_write(&llioc.ioc_sem);
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  Walks the registry under the write lock; on a
 * match (presumably tmp == magic — the comparison line is missing from
 * this extraction) the record is unlinked, the lock dropped, and the
 * record freed.  If no record matches, a warning is logged.
 * NOTE(review): braces, the magic comparison, and the early return after
 * OBD_FREE are missing from this chunk.
 */
2725 void ll_iocontrol_unregister(void *magic)
2727 struct llioc_data *tmp;
2732 cfs_down_write(&llioc.ioc_sem);
2733 cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
     /* Capture the size before unlinking; OBD_FREE needs it. */
2735 unsigned int size = tmp->iocd_size;
2737 cfs_list_del(&tmp->iocd_list);
     /* Drop the lock before freeing — tmp is already off the list. */
2738 cfs_up_write(&llioc.ioc_sem);
2740 OBD_FREE(tmp, size);
2744 cfs_up_write(&llioc.ioc_sem);
2746 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2752 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2753 unsigned int cmd, unsigned long arg, int *rcp)
2755 enum llioc_iter ret = LLIOC_CONT;
2756 struct llioc_data *data;
2757 int rc = -EINVAL, i;
2759 cfs_down_read(&llioc.ioc_sem);
2760 cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2761 for (i = 0; i < data->iocd_count; i++) {
2762 if (cmd != data->iocd_cmd[i])
2765 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2769 if (ret == LLIOC_STOP)
2772 cfs_up_read(&llioc.ioc_sem);