4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * GFP_NOFS avoids re-entering the filesystem during reclaim.
 * NOTE(review): the NULL check after allocation and the return statement
 * fall outside this excerpt — presumably NULL is returned on slab failure.
 */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start with a clean write-failure marker for the new descriptor. */
72 fd->fd_write_failed = false;
/* Return a ll_file_data descriptor to its slab cache. */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes into @op_data for an MDS RPC:
 * FID, mode, a/m/c times, size, block count, external flags, the current
 * IO epoch, the open handle @fh and the MDS capability.  Also sets
 * MDS_DATA_MODIFIED in op_bias when the inode carries LLIF_DATA_MODIFIED.
 */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper around iattr, hence the cast. */
93 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
94 ll_inode_to_ext_flags(inode->i_flags);
95 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
97 op_data->op_handle = *fh;
/* NOTE(review): capability reference obtained here; release is presumably
 * done by the caller via ll_finish_md_op_data() — confirm. */
98 op_data->op_capa1 = ll_mdscapa_get(inode);
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * the subsequent md_close() RPC.  Size/blocks are only sent when SOM is
 * unsupported or the file is not regular (otherwise the MDS fetches them
 * separately); the epoch is closed via ll_ioepoch_close().
 */
108 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
109 struct obd_client_handle *och)
113 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
114 ATTR_MTIME | ATTR_MTIME_SET |
115 ATTR_CTIME | ATTR_CTIME_SET;
/* NOTE(review): the body of this FMODE_WRITE guard (likely a goto/out path)
 * is not visible in this excerpt. */
117 if (!(och->och_flags & FMODE_WRITE))
120 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
121 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
123 ll_ioepoch_close(inode, op_data, &och, 0);
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send the MDS close RPC for an open handle and tear down client state.
 * @data_version non-NULL signals an HSM release-on-close (MDS_HSM_RELEASE).
 * On epoch close with SOM the Size-on-MDS attributes are pushed back via
 * ll_som_update().  Errors are logged but the handle is still invalidated
 * (och_fh.cookie set to DEAD_HANDLE_MAGIC) and the replay data cleared.
 */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
147 * XXX: in case of LMV, is this correct to access
150 CERROR("Invalid MDC connection handle "LPX64"\n",
151 ll_i2mdexp(inode)->exp_handle.h_cookie);
155 OBD_ALLOC_PTR(op_data);
157 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
159 ll_prepare_close(inode, op_data, och);
160 if (data_version != NULL) {
161 /* Pass in data_version implies release. */
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *data_version;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether this close terminates the IO epoch before the RPC. */
167 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
170 /* This close must have the epoch closed. */
171 LASSERT(epoch_close);
172 /* MDS has instructed us to obtain Size-on-MDS attribute from
173 * OSTs and send setattr to back to MDS. */
174 rc = ll_som_update(inode, op_data);
176 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
177 " failed: rc = %d\n",
178 ll_i2mdexp(inode)->exp_obd->obd_name,
179 PFID(ll_inode2fid(inode)), rc);
183 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
184 ll_i2mdexp(inode)->exp_obd->obd_name,
185 PFID(ll_inode2fid(inode)), rc);
188 /* DATA_MODIFIED flag was successfully sent on close, cancel data
189 * modification flag. */
190 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
191 struct ll_inode_info *lli = ll_i2info(inode);
193 spin_lock(&lli->lli_lock);
194 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
195 spin_unlock(&lli->lli_lock);
199 rc = ll_objects_destroy(req, inode);
201 CERROR("%s: inode "DFID
202 " ll_objects destroy: rc = %d\n",
203 ll_i2mdexp(inode)->exp_obd->obd_name,
204 PFID(ll_inode2fid(inode)), rc);
/* HSM release: verify the server actually released the file's data. */
207 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
208 struct mdt_body *body;
209 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
210 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
214 ll_finish_md_op_data(op_data);
/* SOM without an epoch close on a written regular file: defer the final
 * DONE_WRITING to the dedicated queue. */
218 if (exp_connect_som(exp) && !epoch_close &&
219 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
220 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
222 md_clear_open_replay_data(md_exp, och);
223 /* Free @och if it is not waiting for DONE_WRITING. */
224 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
227 if (req) /* This is close request */
228 ptlrpc_req_finished(req);
/*
 * Close the MDS open handle of the given mode (write/exec/read) if this
 * caller is the last user.  Selects the per-mode handle pointer and
 * use-count under lli_och_mutex; if other users remain, the close is
 * skipped.  Otherwise the handle is detached and closed via
 * ll_close_inode_openhandle().
 */
232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
234 struct ll_inode_info *lli = ll_i2info(inode);
235 struct obd_client_handle **och_p;
236 struct obd_client_handle *och;
241 if (fmode & FMODE_WRITE) {
242 och_p = &lli->lli_mds_write_och;
243 och_usecount = &lli->lli_open_fd_write_count;
244 } else if (fmode & FMODE_EXEC) {
245 och_p = &lli->lli_mds_exec_och;
246 och_usecount = &lli->lli_open_fd_exec_count;
248 LASSERT(fmode & FMODE_READ);
249 och_p = &lli->lli_mds_read_och;
250 och_usecount = &lli->lli_open_fd_read_count;
253 mutex_lock(&lli->lli_och_mutex);
254 if (*och_usecount > 0) {
255 /* There are still users of this handle, so skip
257 mutex_unlock(&lli->lli_och_mutex);
/* NOTE(review): detaching och from *och_p happens in lines not visible
 * here, before the mutex is dropped. */
263 mutex_unlock(&lli->lli_och_mutex);
266 /* There might be a race and this handle may already
268 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: drops group lock, lease and private
 * open handle held in the ll_file_data, decrements the per-mode open
 * counters, and calls ll_md_real_close() unless a cached OPEN DLM lock
 * lets us skip talking to the MDS.  Finally frees the ll_file_data.
 */
275 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
279 struct ll_inode_info *lli = ll_i2info(inode);
283 /* clear group lock, if present */
284 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
285 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
287 if (fd->fd_lease_och != NULL) {
290 /* Usually the lease is not released when the
291 * application crashed, we need to release here. */
292 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
293 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
294 PFID(&lli->lli_fid), rc, lease_broken);
296 fd->fd_lease_och = NULL;
299 if (fd->fd_och != NULL) {
300 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
305 /* Let's see if we have good enough OPEN lock on the file and if
306 we can skip talking to MDS */
307 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* LDLM_FL_TEST_LOCK: match only, do not take a new reference. */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct lustre_handle lockh;
311 struct inode *inode = file->f_dentry->d_inode;
312 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
314 mutex_lock(&lli->lli_och_mutex);
315 if (fd->fd_omode & FMODE_WRITE) {
317 LASSERT(lli->lli_open_fd_write_count);
318 lli->lli_open_fd_write_count--;
319 } else if (fd->fd_omode & FMODE_EXEC) {
321 LASSERT(lli->lli_open_fd_exec_count);
322 lli->lli_open_fd_exec_count--;
325 LASSERT(lli->lli_open_fd_read_count);
326 lli->lli_open_fd_read_count--;
328 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN ibits lock -> must do a real close RPC. */
330 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
331 LDLM_IBITS, &policy, lockmode,
333 rc = ll_md_real_close(file->f_dentry->d_inode,
337 CERROR("released file has negative dentry: file = %p, "
338 "dentry = %p, name = %s\n",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
/*
 * VFS ->release() entry point.  Handles remote-client ACL bookkeeping on
 * the root inode, statahead deauthorization, async-rc harvesting for
 * regular files, and delegates the real work to ll_md_close().
 */
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
366 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead,
386 * because parent and child process can share the same file handle. */
387 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
388 ll_deauthorize_statahead(inode, fd);
/* Root inode takes a short-cut: no MDS close RPC needed. */
390 if (inode->i_sb->s_root == file->f_dentry) {
391 LUSTRE_FPRIVATE(file) = NULL;
392 ll_file_data_put(fd);
396 if (!S_ISDIR(inode->i_mode)) {
397 if (lli->lli_clob != NULL)
398 lov_read_and_clear_async_rc(lli->lli_clob);
399 lli->lli_async_rc = 0;
402 rc = ll_md_close(sbi->ll_md_exp, inode, file);
404 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
405 libcfs_debug_dumplog();
/*
 * Issue an MDS intent-open (md_intent_lock) for @file, optionally
 * carrying striping data @lmm/@lmmsize.  The dentry name is packed only
 * when the server lacks OBD_CONNECT_OPEN_BY_FID and the name is valid.
 * On success the reply is used to update the inode (ll_prep_inode) and
 * register the lock (ll_set_lock_data).
 */
410 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
411 struct lookup_intent *itp)
413 struct dentry *de = file->f_dentry;
414 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
415 struct dentry *parent = de->d_parent;
416 const char *name = NULL;
418 struct md_op_data *op_data;
419 struct ptlrpc_request *req = NULL;
423 LASSERT(parent != NULL);
424 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
426 /* if server supports open-by-fid, or file name is invalid, don't pack
427 * name in open request */
428 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
429 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
430 name = de->d_name.name;
431 len = de->d_name.len;
434 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
435 name, len, 0, LUSTRE_OPC_ANY, NULL);
437 RETURN(PTR_ERR(op_data));
438 op_data->op_data = lmm;
439 op_data->op_data_size = lmmsize;
441 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
442 &ll_md_blocking_ast, 0);
443 ll_finish_md_op_data(op_data);
445 /* reason for keep own exit path - don`t flood log
446 * with messages with -ESTALE errors.
/* An unusable open result still carries an open handle: release it. */
448 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
449 it_open_error(DISP_OPEN_OPEN, itp))
451 ll_release_openhandle(de, itp);
455 if (it_disposition(itp, DISP_LOOKUP_NEG))
456 GOTO(out, rc = -ENOENT);
458 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
459 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
460 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
464 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
465 if (!rc && itp->d.lustre.it_lock_mode)
466 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
469 ptlrpc_req_finished(req);
470 ll_intent_drop_lock(itp);
476 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
477 * not believe attributes if a few ioepoch holders exist. Attributes for
478 * previous ioepoch if new one is opened are also skipped by MDS.
480 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
482 if (ioepoch && lli->lli_ioepoch != ioepoch) {
483 lli->lli_ioepoch = ioepoch;
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDT reply body carried in the
 * intent (open handle, FID, lease lock cookie, open flags) and register
 * the handle for open replay.  Returns md_set_open_replay_data()'s rc.
 */
489 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
490 struct obd_client_handle *och)
492 struct ptlrpc_request *req = it->d.lustre.it_data;
493 struct mdt_body *body;
495 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
496 och->och_fh = body->mbo_handle;
497 och->och_fid = body->mbo_fid1;
498 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
499 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
500 och->och_flags = it->it_flags;
502 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-local part of an open: optionally fill @och from
 * the intent reply (and record the reply's ioepoch), then attach @fd to
 * the struct file, initialize readahead and the ll_cl_context fields,
 * and remember the open mode in fd_omode.
 */
505 static int ll_local_open(struct file *file, struct lookup_intent *it,
506 struct ll_file_data *fd, struct obd_client_handle *och)
508 struct inode *inode = file->f_dentry->d_inode;
509 struct ll_inode_info *lli = ll_i2info(inode);
512 LASSERT(!LUSTRE_FPRIVATE(file));
/* NOTE(review): this branch presumably runs only when @och != NULL —
 * the guarding condition is not visible in this excerpt. */
517 struct ptlrpc_request *req = it->d.lustre.it_data;
518 struct mdt_body *body;
521 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
525 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
526 ll_ioepoch_open(lli, body->mbo_ioepoch);
529 LUSTRE_FPRIVATE(file) = fd;
530 ll_readahead_init(inode, &fd->fd_ras);
531 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
533 /* ll_cl_context initialize */
534 rwlock_init(&fd->fd_lock);
535 INIT_LIST_HEAD(&fd->fd_lccs);
540 /* Open a file, and (for the very first open) create objects on the OSTs at
541 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
542 * creation or open until ll_lov_setstripe() ioctl is called.
544 * If we already have the stripe MD locally then we don't request it in
545 * md_open(), by passing a lmm_size = 0.
547 * It is up to the application to ensure no other processes open this file
548 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
549 * used. We might be able to avoid races of that sort by getting lli_open_sem
550 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
551 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
553 int ll_file_open(struct inode *inode, struct file *file)
555 struct ll_inode_info *lli = ll_i2info(inode);
556 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
557 .it_flags = file->f_flags };
558 struct obd_client_handle **och_p = NULL;
559 __u64 *och_usecount = NULL;
560 struct ll_file_data *fd;
564 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
565 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent stashed by the lookup path may arrive via private_data. */
567 it = file->private_data; /* XXX: compat macro */
568 file->private_data = NULL; /* prevent ll_local_open assertion */
570 fd = ll_file_data_get();
572 GOTO(out_openerr, rc = -ENOMEM);
575 if (S_ISDIR(inode->i_mode))
576 ll_authorize_statahead(inode, fd);
/* Root inode: no MDS open needed, just attach the fd. */
578 if (inode->i_sb->s_root == file->f_dentry) {
579 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN intent (oit). */
583 if (!it || !it->d.lustre.it_disposition) {
584 /* Convert f_flags into access mode. We cannot use file->f_mode,
585 * because everything but O_ACCMODE mask was stripped from
587 if ((oit.it_flags + 1) & O_ACCMODE)
589 if (file->f_flags & O_TRUNC)
590 oit.it_flags |= FMODE_WRITE;
592 /* kernel only call f_op->open in dentry_open. filp_open calls
593 * dentry_open after call to open_namei that checks permissions.
594 * Only nfsd_open call dentry_open directly without checking
595 * permissions and because of that this code below is safe. */
596 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
597 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
599 /* We do not want O_EXCL here, presumably we opened the file
600 * already? XXX - NFS implications? */
601 oit.it_flags &= ~O_EXCL;
603 /* bug20584, if "it_flags" contains O_CREAT, the file will be
604 * created if necessary, then "IT_CREAT" should be set to keep
605 * consistent with it */
606 if (oit.it_flags & O_CREAT)
607 oit.it_op |= IT_CREAT;
613 /* Let's see if we have file open on MDS already. */
614 if (it->it_flags & FMODE_WRITE) {
615 och_p = &lli->lli_mds_write_och;
616 och_usecount = &lli->lli_open_fd_write_count;
617 } else if (it->it_flags & FMODE_EXEC) {
618 och_p = &lli->lli_mds_exec_och;
619 och_usecount = &lli->lli_open_fd_exec_count;
621 och_p = &lli->lli_mds_read_och;
622 och_usecount = &lli->lli_open_fd_read_count;
625 mutex_lock(&lli->lli_och_mutex);
626 if (*och_p) { /* Open handle is present */
627 if (it_disposition(it, DISP_OPEN_OPEN)) {
628 /* Well, there's extra open request that we do not need,
629 let's close it somehow. This will decref request. */
630 rc = it_open_error(DISP_OPEN_OPEN, it);
632 mutex_unlock(&lli->lli_och_mutex);
633 GOTO(out_openerr, rc);
636 ll_release_openhandle(file->f_dentry, it);
/* Reuse the cached handle: local open only, no new och. */
640 rc = ll_local_open(file, it, fd, NULL);
643 mutex_unlock(&lli->lli_och_mutex);
644 GOTO(out_openerr, rc);
647 LASSERT(*och_usecount == 0);
648 if (!it->d.lustre.it_disposition) {
649 /* We cannot just request lock handle now, new ELC code
650 means that one of other OPEN locks for this file
651 could be cancelled, and since blocking ast handler
652 would attempt to grab och_mutex as well, that would
653 result in a deadlock */
654 mutex_unlock(&lli->lli_och_mutex);
656 * Normally called under two situations:
658 * 2. A race/condition on MDS resulting in no open
659 * handle to be returned from LOOKUP|OPEN request,
660 * for example if the target entry was a symlink.
662 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
664 * Always specify MDS_OPEN_BY_FID because we don't want
665 * to get file with different fid.
667 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
668 rc = ll_intent_file_open(file, NULL, 0, it);
670 GOTO(out_openerr, rc);
674 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
676 GOTO(out_och_free, rc = -ENOMEM);
680 /* md_intent_lock() didn't get a request ref if there was an
681 * open error, so don't do cleanup on the request here
683 /* XXX (green): Should not we bail out on any error here, not
684 * just open error? */
685 rc = it_open_error(DISP_OPEN_OPEN, it);
687 GOTO(out_och_free, rc);
689 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
690 "inode %p: disposition %x, status %d\n", inode,
691 it_disposition(it, ~0), it->d.lustre.it_status);
693 rc = ll_local_open(file, it, fd, *och_p);
695 GOTO(out_och_free, rc);
697 mutex_unlock(&lli->lli_och_mutex);
700 /* Must do this outside lli_och_mutex lock to prevent deadlock where
701 different kind of OPEN lock for this same inode gets cancelled
702 by ldlm_cancel_lru */
703 if (!S_ISREG(inode->i_mode))
704 GOTO(out_och_free, rc);
/* Delayed object creation: no stripe MD yet and either explicit
 * O_LOV_DELAY_CREATE or a read-only open. */
708 if (!lli->lli_has_smd &&
709 (cl_is_lov_delay_create(file->f_flags) ||
710 (file->f_mode & FMODE_WRITE) == 0)) {
711 CDEBUG(D_INODE, "object creation was delayed\n");
712 GOTO(out_och_free, rc);
714 cl_lov_delay_create_clear(&file->f_flags);
715 GOTO(out_och_free, rc);
/* Error unwind: free a handle we allocated but never published. */
719 if (och_p && *och_p) {
720 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
721 *och_p = NULL; /* OBD_FREE writes some magic there */
724 mutex_unlock(&lli->lli_och_mutex);
727 if (lli->lli_opendir_key == fd)
728 ll_deauthorize_statahead(inode, fd);
730 ll_file_data_put(fd);
732 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
735 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
736 ptlrpc_req_finished(it->d.lustre.it_data);
737 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lock
 * asynchronously; LDLM_CB_CANCELING needs no per-openhandle handling
 * here (see the comments at the ll_lease_open() call site).
 */
743 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
744 struct ldlm_lock_desc *desc, void *data, int flag)
747 struct lustre_handle lockh;
751 case LDLM_CB_BLOCKING:
752 ldlm_lock2handle(lock, &lockh);
753 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
755 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
759 case LDLM_CB_CANCELING:
767 * Acquire a lease and open the file.
/*
 * @fmode must be exactly FMODE_READ or FMODE_WRITE.  When @file is given,
 * the existing openhandle is reused (only if this fd is the sole opener)
 * so the MDT can recognize the same owner.  The intent carries
 * MDS_OPEN_LEASE; the granted lease lock is kept out of the LRU
 * (LDLM_FL_NO_LRU) and marked LDLM_FL_EXCL so regular opens cannot match
 * it.  Returns the filled och or an ERR_PTR.
 */
769 static struct obd_client_handle *
770 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
773 struct lookup_intent it = { .it_op = IT_OPEN };
774 struct ll_sb_info *sbi = ll_i2sbi(inode);
775 struct md_op_data *op_data;
776 struct ptlrpc_request *req = NULL;
777 struct lustre_handle old_handle = { 0 };
778 struct obd_client_handle *och = NULL;
783 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
784 RETURN(ERR_PTR(-EINVAL));
787 struct ll_inode_info *lli = ll_i2info(inode);
788 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
789 struct obd_client_handle **och_p;
/* The fd's mode must include the requested one; exec fds are excluded. */
792 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
793 RETURN(ERR_PTR(-EPERM));
795 /* Get the openhandle of the file */
797 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd. */
798 if (fd->fd_lease_och != NULL) {
799 mutex_unlock(&lli->lli_och_mutex);
803 if (fd->fd_och == NULL) {
804 if (file->f_mode & FMODE_WRITE) {
805 LASSERT(lli->lli_mds_write_och != NULL);
806 och_p = &lli->lli_mds_write_och;
807 och_usecount = &lli->lli_open_fd_write_count;
809 LASSERT(lli->lli_mds_read_och != NULL);
810 och_p = &lli->lli_mds_read_och;
811 och_usecount = &lli->lli_open_fd_read_count;
/* Only steal the shared handle when this fd is its sole user. */
813 if (*och_usecount == 1) {
820 mutex_unlock(&lli->lli_och_mutex);
821 if (rc < 0) /* more than 1 opener */
824 LASSERT(fd->fd_och != NULL);
825 old_handle = fd->fd_och->och_fh;
830 RETURN(ERR_PTR(-ENOMEM));
832 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
833 LUSTRE_OPC_ANY, NULL);
835 GOTO(out, rc = PTR_ERR(op_data));
837 /* To tell the MDT this openhandle is from the same owner */
838 op_data->op_handle = old_handle;
840 it.it_flags = fmode | open_flags;
841 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
842 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
843 &ll_md_blocking_lease_ast,
844 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
845 * it can be cancelled which may mislead applications that the lease is
847 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
848 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
849 * doesn't deal with openhandle, so normal openhandle will be leaked. */
850 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
851 ll_finish_md_op_data(op_data);
852 ptlrpc_req_finished(req);
854 GOTO(out_release_it, rc);
856 if (it_disposition(&it, DISP_LOOKUP_NEG))
857 GOTO(out_release_it, rc = -ENOENT);
859 rc = it_open_error(DISP_OPEN_OPEN, &it);
861 GOTO(out_release_it, rc);
863 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
864 ll_och_fill(sbi->ll_md_exp, &it, och);
866 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
867 GOTO(out_close, rc = -EOPNOTSUPP);
869 /* already get lease, handle lease lock */
870 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
871 if (it.d.lustre.it_lock_mode == 0 ||
872 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
873 /* open lock must return for lease */
874 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
875 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
876 it.d.lustre.it_lock_bits);
877 GOTO(out_close, rc = -EPROTO);
880 ll_intent_release(&it);
/* Error unwind: drop the open lock and close the handle we obtained. */
884 /* Cancel open lock */
885 if (it.d.lustre.it_lock_mode != 0) {
886 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
887 it.d.lustre.it_lock_mode);
888 it.d.lustre.it_lock_mode = 0;
889 och->och_lease_handle.cookie = 0ULL;
891 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
893 CERROR("%s: error closing file "DFID": %d\n",
894 ll_get_fsname(inode->i_sb, NULL, 0),
895 PFID(&ll_i2info(inode)->lli_fid), rc2);
896 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
898 ll_intent_release(&it);
906 * Release lease and close the file.
907 * It will check if the lease has ever broken.
909 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
912 struct ldlm_lock *lock;
913 bool cancelled = true;
/* Inspect the lease lock to learn whether it was already cancelled
 * (i.e. the lease was broken) before we tear it down. */
917 lock = ldlm_handle2lock(&och->och_lease_handle);
919 lock_res_and_lock(lock);
920 cancelled = ldlm_is_cancel(lock);
921 unlock_res_and_lock(lock);
925 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
926 PFID(&ll_i2info(inode)->lli_fid), cancelled);
929 ldlm_cli_cancel(&och->och_lease_handle, 0);
930 if (lease_broken != NULL)
931 *lease_broken = cancelled;
933 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
938 /* Fills the obdo with the attributes for the lsm */
/*
 * Async-getattr across the stripes of @lsm, waiting for completion.
 * @dv_flags LL_DV_WR_FLUSH / LL_DV_RD_FLUSH add OBD_FL_SRVLOCK (and
 * OBD_FL_FLUSH for writes) so the OSTs flush under a server-side lock.
 * On success the obdo's o_valid is masked to the attributes this path
 * actually guarantees.
 */
939 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
940 struct obd_capa *capa, struct obdo *obdo,
941 __u64 ioepoch, int dv_flags)
943 struct ptlrpc_request_set *set;
944 struct obd_info oinfo = { { { 0 } } };
949 LASSERT(lsm != NULL);
953 oinfo.oi_oa->o_oi = lsm->lsm_oi;
954 oinfo.oi_oa->o_mode = S_IFREG;
955 oinfo.oi_oa->o_ioepoch = ioepoch;
956 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
957 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
958 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
959 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
960 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
961 OBD_MD_FLDATAVERSION;
962 oinfo.oi_capa = capa;
963 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
964 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
965 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
966 if (dv_flags & LL_DV_WR_FLUSH)
967 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
970 set = ptlrpc_prep_set();
972 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
975 rc = obd_getattr_async(exp, &oinfo, set);
977 rc = ptlrpc_set_wait(set);
978 ptlrpc_set_destroy(set);
981 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
982 OBD_MD_FLATIME | OBD_MD_FLMTIME |
983 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
984 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A write-flush request that did not come back flagged as flushed is an
 * error; the handling branch lies outside this excerpt. */
985 if (dv_flags & LL_DV_WR_FLUSH &&
986 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
987 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
994 * Performs the getattr on the inode and updates its fields.
995 * If @sync != 0, perform the getattr under the server-side lock.
997 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
998 __u64 ioepoch, int sync)
1000 struct obd_capa *capa = ll_mdscapa_get(inode);
1001 struct lov_stripe_md *lsm;
1005 lsm = ccc_inode_lsm_get(inode);
1006 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1007 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1010 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Push the fresh OST attributes into the VFS inode. */
1012 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1013 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1014 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1015 (unsigned long long)inode->i_blocks,
1016 1UL << inode->i_blkbits);
1018 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (cached in lli_lvb) with OST attributes
 * obtained through cl_object_attr_get(), taking the most recent of each
 * timestamp, and update the inode's size/blocks — all under the inode
 * size lock.
 */
1022 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1024 struct ll_inode_info *lli = ll_i2info(inode);
1025 struct cl_object *obj = lli->lli_clob;
1026 struct cl_attr *attr = ccc_env_thread_attr(env);
1032 ll_inode_size_lock(inode);
1033 /* merge timestamps the most recently obtained from mds with
1034 timestamps obtained from osts */
1035 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1036 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1037 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1039 lvb.lvb_size = i_size_read(inode);
1040 lvb.lvb_blocks = inode->i_blocks;
1041 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1042 lvb.lvb_atime = LTIME_S(inode->i_atime);
1043 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1045 cl_object_attr_lock(obj);
1046 rc = cl_object_attr_get(env, obj, attr);
1047 cl_object_attr_unlock(obj);
/* Keep the newer timestamp of the MDS- and OST-sourced values. */
1050 if (lvb.lvb_atime < attr->cat_atime)
1051 lvb.lvb_atime = attr->cat_atime;
1052 if (lvb.lvb_ctime < attr->cat_ctime)
1053 lvb.lvb_ctime = attr->cat_ctime;
1054 if (lvb.lvb_mtime < attr->cat_mtime)
1055 lvb.lvb_mtime = attr->cat_mtime;
1057 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1058 PFID(&lli->lli_fid), attr->cat_size);
1059 cl_isize_write_nolock(inode, attr->cat_size);
1061 inode->i_blocks = attr->cat_blocks;
1063 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1064 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1065 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1067 ll_inode_size_unlock(inode);
/*
 * ioctl helper: fetch current OST attributes for @lsm and copy
 * size/blocks/times into the caller-supplied stat structure.
 */
1072 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1075 struct obdo obdo = { 0 };
1078 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1080 st->st_size = obdo.o_size;
1081 st->st_blocks = obdo.o_blocks;
1082 st->st_mtime = obdo.o_mtime;
1083 st->st_atime = obdo.o_atime;
1084 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for this file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: O_NOATIME,
 * inode/superblock noatime flags, and mount noatime/readonly/nodiratime
 * options.  NOTE(review): the true/false returns between the checks are
 * not visible in this excerpt.
 */
1089 static bool file_is_noatime(const struct file *file)
1091 const struct vfsmount *mnt = file->f_path.mnt;
1092 const struct inode *inode = file->f_path.dentry->d_inode;
1094 /* Adapted from file_accessed() and touch_atime().*/
1095 if (file->f_flags & O_NOATIME)
1098 if (inode->i_flags & S_NOATIME)
1101 if (IS_NOATIME(inode))
1104 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1107 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1110 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblocking/append/sync
 * hints, the cl object, the lock-request mode (never for nolock files,
 * mandatory for O_APPEND, "maybe" otherwise) and the noatime decision.
 */
1116 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1118 struct inode *inode = file->f_dentry->d_inode;
1120 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1122 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1123 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1124 file->f_flags & O_DIRECT ||
1127 io->ci_obj = ll_i2info(inode)->lli_clob;
1128 io->ci_lockreq = CILR_MAYBE;
1129 if (ll_file_nolock(file)) {
1130 io->ci_lockreq = CILR_NEVER;
1131 io->ci_no_srvlock = 1;
1132 } else if (file->f_flags & O_APPEND) {
1133 io->ci_lockreq = CILR_MANDATORY;
1136 io->ci_noatime = file_is_noatime(file);
/*
 * Common read/write engine: set up the cl_io, take the per-file range
 * lock for non-grouplock writes (whole-file range for O_APPEND), run the
 * cl_io loop under lli_trunc_sem for normal IO, and account the bytes
 * transferred.  A zero/-ENODATA result with ci_need_restart set retries
 * the IO from the loop head visible in the original file.
 */
1140 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1141 struct file *file, enum cl_io_type iot,
1142 loff_t *ppos, size_t count)
1144 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1145 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 struct range_lock range;
1151 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1152 file->f_dentry->d_name.name, iot, *ppos, count);
1155 io = ccc_env_thread_io(env);
1156 ll_io_init(io, file, iot == CIT_WRITE);
1158 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1159 struct vvp_io *vio = vvp_env_io(env);
1160 struct ccc_io *cio = ccc_env_io(env);
1161 bool range_locked = false;
/* O_APPEND writes lock the whole file; others lock [ppos, ppos+count). */
1163 if (file->f_flags & O_APPEND)
1164 range_lock_init(&range, 0, LUSTRE_EOF);
1166 range_lock_init(&range, *ppos, *ppos + count - 1);
1167 cio->cui_fd = LUSTRE_FPRIVATE(file);
1168 vio->cui_io_subtype = args->via_io_subtype;
1170 switch (vio->cui_io_subtype) {
1172 cio->cui_iov = args->u.normal.via_iov;
1173 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1174 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1175 cio->cui_iocb = args->u.normal.via_iocb;
/* Group-locked files already serialize writers; skip the range lock. */
1176 if ((iot == CIT_WRITE) &&
1177 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1178 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1180 result = range_lock(&lli->lli_write_tree,
1185 range_locked = true;
1187 down_read(&lli->lli_trunc_sem);
1190 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1191 vio->u.splice.cui_flags = args->u.splice.via_flags;
1194 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1198 ll_cl_add(file, env, io);
1199 result = cl_io_loop(env, io);
1200 ll_cl_remove(file, env);
1202 if (args->via_io_subtype == IO_NORMAL)
1203 up_read(&lli->lli_trunc_sem);
1205 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1207 range_unlock(&lli->lli_write_tree, &range);
1210 /* cl_io_rw_init() handled IO */
1211 result = io->ci_result;
1214 if (io->ci_nob > 0) {
1215 result = io->ci_nob;
1216 *ppos = io->u.ci_wr.wr.crw_pos;
1220 cl_io_fini(env, io);
1221 /* If any bit been read/written (result != 0), we just return
1222 * short read/write instead of restart io. */
1223 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1224 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1225 iot == CIT_READ ? "read" : "write",
1226 file->f_dentry->d_name.name, *ppos, count);
1227 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
1231 if (iot == CIT_READ) {
1233 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1234 LPROC_LL_READ_BYTES, result);
1235 } else if (iot == CIT_WRITE) {
1237 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1238 LPROC_LL_WRITE_BYTES, result);
1239 fd->fd_write_failed = false;
1240 } else if (result != -ERESTARTSYS) {
1241 fd->fd_write_failed = true;
1244 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1251 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate a user iovec array and compute the total byte count.
 * Rejects any segment with a negative length or a cumulative length
 * that wraps negative; trims the count at the first inaccessible
 * segment (mirrors the kernel's generic write-path validation).
 * NOTE(review): some lines are elided in this view.
 */
1253 static int ll_file_get_iov_count(const struct iovec *iov,
1254 unsigned long *nr_segs, size_t *count)
1259 for (seg = 0; seg < *nr_segs; seg++) {
1260 const struct iovec *iv = &iov[seg];
1263 * If any segment has a negative length, or the cumulative
1264 * length ever wraps negative then return -EINVAL.
1267 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
/* Segment is readable from user space: accept and continue. */
1269 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1274 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Vector (aio) read entry point: validates the iovec, packs it into
 * per-thread vvp_io_args, and drives ll_file_io_generic() with CIT_READ.
 * Advances iocb->ki_pos by the amount read.
 */
1281 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos)
1285 struct vvp_io_args *args;
1291 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1295 env = cl_env_get(&refcheck);
1297 RETURN(PTR_ERR(env));
1299 args = vvp_env_args(env, IO_NORMAL);
1300 args->u.normal.via_iov = (struct iovec *)iov;
1301 args->u.normal.via_nrsegs = nr_segs;
1302 args->u.normal.via_iocb = iocb;
1304 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1305 &iocb->ki_pos, count);
1306 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wraps the user buffer in a one-segment
 * iovec plus a synchronous kiocb (both stored in per-thread env info) and
 * delegates to ll_file_aio_read(). *ppos is updated from the kiocb on return.
 */
1310 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1314 struct iovec *local_iov;
1315 struct kiocb *kiocb;
1320 env = cl_env_get(&refcheck);
1322 RETURN(PTR_ERR(env));
1324 local_iov = &vvp_env_info(env)->vti_local_iov;
1325 kiocb = &vvp_env_info(env)->vti_kiocb;
1326 local_iov->iov_base = (void __user *)buf;
1327 local_iov->iov_len = count;
1328 init_sync_kiocb(kiocb, file);
1329 kiocb->ki_pos = *ppos;
/* Field name for the remaining-byte count differs across kernels. */
1330 #ifdef HAVE_KIOCB_KI_LEFT
1331 kiocb->ki_left = count;
1333 kiocb->ki_nbytes = count;
1336 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1337 *ppos = kiocb->ki_pos;
1339 cl_env_put(env, &refcheck);
1344 * Write to a file (through the page cache).
/*
 * Vector (aio) write entry point: mirrors ll_file_aio_read() but drives
 * ll_file_io_generic() with CIT_WRITE. Advances iocb->ki_pos by the
 * amount written.
 */
1347 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1348 unsigned long nr_segs, loff_t pos)
1351 struct vvp_io_args *args;
1357 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1361 env = cl_env_get(&refcheck);
1363 RETURN(PTR_ERR(env));
1365 args = vvp_env_args(env, IO_NORMAL);
1366 args->u.normal.via_iov = (struct iovec *)iov;
1367 args->u.normal.via_nrsegs = nr_segs;
1368 args->u.normal.via_iocb = iocb;
1370 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1371 &iocb->ki_pos, count);
1372 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: wraps the user buffer in a one-segment
 * iovec plus a synchronous kiocb and delegates to ll_file_aio_write().
 * *ppos is updated from the kiocb on return.
 */
1376 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1377 size_t count, loff_t *ppos)
1380 struct iovec *local_iov;
1381 struct kiocb *kiocb;
1386 env = cl_env_get(&refcheck);
1388 RETURN(PTR_ERR(env));
1390 local_iov = &vvp_env_info(env)->vti_local_iov;
1391 kiocb = &vvp_env_info(env)->vti_kiocb;
1392 local_iov->iov_base = (void __user *)buf;
1393 local_iov->iov_len = count;
1394 init_sync_kiocb(kiocb, file);
1395 kiocb->ki_pos = *ppos;
/* Field name for the remaining-byte count differs across kernels. */
1396 #ifdef HAVE_KIOCB_KI_LEFT
1397 kiocb->ki_left = count;
1399 kiocb->ki_nbytes = count;
1402 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1403 *ppos = kiocb->ki_pos;
1405 cl_env_put(env, &refcheck);
1410 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: packs the destination pipe and splice flags
 * into IO_SPLICE args and drives ll_file_io_generic() with CIT_READ.
 */
1412 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1413 struct pipe_inode_info *pipe, size_t count,
1417 struct vvp_io_args *args;
1422 env = cl_env_get(&refcheck);
1424 RETURN(PTR_ERR(env));
1426 args = vvp_env_args(env, IO_SPLICE);
1427 args->u.splice.via_pipe = pipe;
1428 args->u.splice.via_flags = flags;
1430 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1431 cl_env_put(env, &refcheck);
/*
 * Recreate the OST object identified by \a oi on OST index \a ost_idx
 * for \a inode: clones the inode's striping metadata, fills an obdo with
 * the target id and OBD_FL_RECREATE_OBJS, and calls obd_create() under
 * the inode size lock. Fails with -ENOENT when the file has no objects.
 * NOTE(review): allocation/cleanup lines are partially elided in this view.
 */
1435 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1438 struct obd_export *exp = ll_i2dtexp(inode);
1439 struct obd_trans_info oti = { 0 };
1440 struct obdo *oa = NULL;
1443 struct lov_stripe_md *lsm = NULL, *lsm2;
1450 lsm = ccc_inode_lsm_get(inode);
1451 if (!lsm_has_objects(lsm))
1452 GOTO(out, rc = -ENOENT);
1454 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1455 (lsm->lsm_stripe_count));
1457 OBD_ALLOC_LARGE(lsm2, lsm_size);
1459 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
1462 oa->o_nlink = ost_idx;
1463 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1464 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1465 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1466 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1467 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1468 memcpy(lsm2, lsm, lsm_size);
1469 ll_inode_size_lock(inode);
1470 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1471 ll_inode_size_unlock(inode);
1473 OBD_FREE_LARGE(lsm2, lsm_size);
1476 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copies a struct ll_recreate_obj from user
 * space, builds the MDT0-sequence ost_id, and delegates to
 * ll_lov_recreate(). Requires CAP_SYS_ADMIN.
 */
1481 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1483 struct ll_recreate_obj ucreat;
1487 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1490 if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg,
1494 ostid_set_seq_mdt0(&oi);
1495 ostid_set_id(&oi, ucreat.lrc_id);
1496 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copies a lu_fid from user space, converts
 * it to an ost_id, extracts the OST index encoded in the FID sequence,
 * and delegates to ll_lov_recreate(). Requires CAP_SYS_ADMIN.
 */
1499 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1506 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1509 if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid)))
1512 fid_to_ostid(&fid, &oi);
/* The OST index lives in bits 16..31 of the IDIF sequence. */
1513 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1514 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply striping information \a lum to \a inode by re-opening the file
 * with an IT_OPEN intent that carries the layout. Fails with -EEXIST if
 * the inode already has a layout. On success releases the open handle
 * obtained for the operation; always clears the lov delay-create flag
 * on the way out. NOTE(review): some error/cleanup lines are elided here.
 */
1517 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1518 __u64 flags, struct lov_user_md *lum,
1521 struct lov_stripe_md *lsm = NULL;
1522 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1526 lsm = ccc_inode_lsm_get(inode);
1528 ccc_inode_lsm_put(inode, lsm);
1529 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1530 PFID(ll_inode2fid(inode)));
1531 GOTO(out, rc = -EEXIST);
1534 ll_inode_size_lock(inode);
1535 oit.it_flags |= MDS_OPEN_BY_FID;
1536 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1538 GOTO(out_unlock, rc);
1539 rc = oit.d.lustre.it_status;
1541 GOTO(out_req_free, rc);
1543 ll_release_openhandle(file->f_dentry, &oit);
1546 ll_inode_size_unlock(inode);
1547 ll_intent_release(&oit);
1548 ccc_inode_lsm_put(inode, lsm);
1550 cl_lov_delay_create_clear(&file->f_flags);
1553 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping metadata) of \a filename relative to the
 * directory \a inode via md_getattr_name(). On success returns the
 * (possibly byte-swapped) lov_mds_md in *lmmp, its size in *lmm_size,
 * and the RPC in *request (caller releases it). Returns -ENODATA when
 * no striping EA exists and -EPROTO for an unrecognized LOV magic.
 */
1557 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1558 struct lov_mds_md **lmmp, int *lmm_size,
1559 struct ptlrpc_request **request)
1561 struct ll_sb_info *sbi = ll_i2sbi(inode);
1562 struct mdt_body *body;
1563 struct lov_mds_md *lmm = NULL;
1564 struct ptlrpc_request *req = NULL;
1565 struct md_op_data *op_data;
1568 rc = ll_get_default_mdsize(sbi, &lmmsize);
1572 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1573 strlen(filename), lmmsize,
1574 LUSTRE_OPC_ANY, NULL);
1575 if (IS_ERR(op_data))
1576 RETURN(PTR_ERR(op_data));
1578 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1579 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1580 ll_finish_md_op_data(op_data);
1582 CDEBUG(D_INFO, "md_getattr_name failed "
1583 "on %s: rc %d\n", filename, rc);
1587 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1588 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1590 lmmsize = body->mbo_eadatasize;
1592 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1594 GOTO(out, rc = -ENODATA);
1597 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1598 LASSERT(lmm != NULL);
1600 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1601 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1602 GOTO(out, rc = -EPROTO);
1606 * This is coming from the MDS, so is probably in
1607 * little endian. We convert it to host endian before
1608 * passing it to userspace.
/* Only swab on big-endian hosts (where LOV_MAGIC != its LE form). */
1610 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1613 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1614 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1617 /* if function called for directory - we should
1618 * avoid swab not existent lsm objects */
1619 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1620 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1621 if (S_ISREG(body->mbo_mode))
1622 lustre_swab_lov_user_md_objects(
1623 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1625 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1626 lustre_swab_lov_user_md_v3(
1627 (struct lov_user_md_v3 *)lmm);
1628 if (S_ISREG(body->mbo_mode))
1629 lustre_swab_lov_user_md_objects(
1630 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1637 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copies a lov_user_md (with one ost_data
 * entry) from user space and applies it via ll_lov_setstripe_ea_info()
 * with MDS_OPEN_HAS_OBJS. Requires CAP_SYS_ADMIN; frees the temporary
 * buffer on every path shown here.
 */
1642 static int ll_lov_setea(struct inode *inode, struct file *file,
1645 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1646 struct lov_user_md *lump;
1647 int lum_size = sizeof(struct lov_user_md) +
1648 sizeof(struct lov_user_ost_data);
1652 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1655 OBD_ALLOC_LARGE(lump, lum_size);
1659 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1660 OBD_FREE_LARGE(lump, lum_size);
1664 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1666 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: reads the user's lov_user_md (v1 first;
 * re-reads as v3 when the magic says so), applies it with
 * ll_lov_setstripe_ea_info(), then refreshes the layout generation and
 * echoes the resulting stripe information back to user space.
 */
1670 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1673 struct lov_user_md_v3 lumv3;
1674 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1675 struct lov_user_md_v1 __user *lumv1p =
1676 (struct lov_user_md_v1 __user *)arg;
1677 struct lov_user_md_v3 __user *lumv3p =
1678 (struct lov_user_md_v3 __user *)arg;
1680 __u64 flags = FMODE_WRITE;
1683 /* first try with v1 which is smaller than v3 */
1684 lum_size = sizeof(struct lov_user_md_v1);
1685 if (copy_from_user(lumv1, lumv1p, lum_size))
1688 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1689 lum_size = sizeof(struct lov_user_md_v3);
1690 if (copy_from_user(&lumv3, lumv3p, lum_size))
1694 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1696 struct lov_stripe_md *lsm;
/* Zero the user's stripe_count so GETSTRIPE fills in real values. */
1699 put_user(0, &lumv1p->lmm_stripe_count);
1701 ll_layout_refresh(inode, &gen);
1702 lsm = ccc_inode_lsm_get(inode);
1703 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1704 0, lsm, (void __user *)arg);
1705 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: pins the inode's stripe metadata and
 * asks the data export to encode it into the user buffer at \a arg.
 */
1710 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1712 struct lov_stripe_md *lsm;
1716 lsm = ccc_inode_lsm_get(inode);
1718 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1719 lsm, (void __user *)arg);
1720 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a group lock with gid \a arg for
 * this open file. Refuses when locking is disabled (ll_file_nolock) or
 * when this fd already holds a group lock; rechecks under lli_lock after
 * acquisition to resolve the race with a concurrent locker and drops the
 * extra lock if another thread won.
 */
1725 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1727 struct ll_inode_info *lli = ll_i2info(inode);
1728 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1729 struct ccc_grouplock grouplock;
1733 if (ll_file_nolock(file))
1734 RETURN(-EOPNOTSUPP);
1736 spin_lock(&lli->lli_lock);
1737 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1738 CWARN("group lock already existed with gid %lu\n",
1739 fd->fd_grouplock.cg_gid);
1740 spin_unlock(&lli->lli_lock);
1743 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1744 spin_unlock(&lli->lli_lock);
/* DLM enqueue happens outside the spinlock; may block unless
 * the file was opened O_NONBLOCK. */
1746 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1747 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1751 spin_lock(&lli->lli_lock);
1752 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1753 spin_unlock(&lli->lli_lock);
1754 CERROR("another thread just won the race\n");
1755 cl_put_grouplock(&grouplock);
1759 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1760 fd->fd_grouplock = grouplock;
1761 spin_unlock(&lli->lli_lock);
1763 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock with gid \a arg
 * held by this open file. Warns (and here returns early from the locked
 * section) when no group lock is held or when the gid does not match;
 * otherwise detaches the lock state under lli_lock and releases the DLM
 * lock outside it.
 */
1767 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1769 struct ll_inode_info *lli = ll_i2info(inode);
1770 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1771 struct ccc_grouplock grouplock;
1774 spin_lock(&lli->lli_lock);
1775 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1776 spin_unlock(&lli->lli_lock);
1777 CWARN("no group lock held\n");
1780 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1782 if (fd->fd_grouplock.cg_gid != arg) {
1783 CWARN("group lock %lu doesn't match current id %lu\n",
1784 arg, fd->fd_grouplock.cg_gid);
1785 spin_unlock(&lli->lli_lock);
1789 grouplock = fd->fd_grouplock;
1790 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1791 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1792 spin_unlock(&lli->lli_lock);
/* Drop the DLM reference after releasing the spinlock. */
1794 cl_put_grouplock(&grouplock);
1795 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1800 * Close inode open handle
1802 * \param dentry [in] dentry which contains the inode
1803 * \param it [in,out] intent which contains open info and result
1806 * \retval <0 failure
/*
 * Releases the MDS open handle produced by an open intent: no-op for the
 * filesystem root or when the intent carries no open disposition;
 * otherwise fills an obd_client_handle from the intent and closes it,
 * dropping the enqueue-open request reference if one is still held.
 */
1808 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1810 struct inode *inode = dentry->d_inode;
1811 struct obd_client_handle *och;
1817 /* Root ? Do nothing. */
1818 if (dentry->d_inode->i_sb->s_root == dentry)
1821 /* No open handle to close? Move away */
1822 if (!it_disposition(it, DISP_OPEN_OPEN))
1825 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1827 OBD_ALLOC(och, sizeof(*och));
1829 GOTO(out, rc = -ENOMEM);
1831 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1833 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1836 /* this one is in place of ll_file_open */
1837 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1838 ptlrpc_req_finished(it->d.lustre.it_data);
1839 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1845 * Get size for inode for which FIEMAP mapping is requested.
1846 * Make the FIEMAP get_info call and returns the result.
/*
 * Executes a FIEMAP request against the data export: validates the
 * requested flags, optionally flushes dirty pages for FIEMAP_FLAG_SYNC,
 * refuses DEVICE_ORDER-unaware callers on multi-stripe files, and sends
 * a KEY_FIEMAP obd_get_info() keyed on the inode's object id. Zero-size
 * files short-circuit with no mapped extents.
 */
1848 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1851 struct obd_export *exp = ll_i2dtexp(inode);
1852 struct lov_stripe_md *lsm = NULL;
1853 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1854 __u32 vallen = num_bytes;
1858 /* Checks for fiemap flags */
1859 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* Report back which flags we do not support. */
1860 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1864 /* Check for FIEMAP_FLAG_SYNC */
1865 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1866 rc = filemap_fdatawrite(inode->i_mapping);
1871 lsm = ccc_inode_lsm_get(inode);
1875 /* If the stripe_count > 1 and the application does not understand
1876 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1878 if (lsm->lsm_stripe_count > 1 &&
1879 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1880 GOTO(out, rc = -EOPNOTSUPP);
1882 fm_key.oa.o_oi = lsm->lsm_oi;
1883 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1885 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1886 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1887 /* If filesize is 0, then there would be no objects for mapping */
1888 if (fm_key.oa.o_size == 0) {
1889 fiemap->fm_mapped_extents = 0;
1893 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1895 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1898 CERROR("obd_get_info failed: rc = %d\n", rc);
1901 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Requires CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 * Reads gf_pathlen first to size the output buffer (capped at PATH_MAX),
 * copies the full request in, performs the iocontrol, and copies the
 * result back. Buffer is freed on all paths via the gf_free label.
 */
1905 int ll_fid2path(struct inode *inode, void __user *arg)
1907 struct obd_export *exp = ll_i2mdexp(inode);
1908 const struct getinfo_fid2path __user *gfin = arg;
1910 struct getinfo_fid2path *gfout;
1916 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1917 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1920 /* Only need to get the buflen */
1921 if (get_user(pathlen, &gfin->gf_pathlen))
1924 if (pathlen > PATH_MAX)
1927 outsize = sizeof(*gfout) + pathlen;
1928 OBD_ALLOC(gfout, outsize);
1932 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1933 GOTO(gf_free, rc = -EFAULT);
1935 /* Call mdc_iocontrol */
1936 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1940 if (copy_to_user(arg, gfout, outsize))
1944 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: sizes and allocates a kernel fiemap buffer
 * from the user's fm_extent_count (with overflow check against SIZE_MAX),
 * copies the request (plus the first extent, used for continuation state)
 * in, runs ll_do_fiemap(), and copies the header plus mapped extents back.
 */
1948 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1950 struct ll_user_fiemap *fiemap_s;
1951 size_t num_bytes, ret_bytes;
1952 unsigned int extent_count;
1955 /* Get the extent count so we can calculate the size of
1956 * required fiemap buffer */
1957 if (get_user(extent_count,
1958 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* Reject counts whose buffer size would overflow size_t. */
1962 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1964 num_bytes = sizeof(*fiemap_s) + (extent_count *
1965 sizeof(struct ll_fiemap_extent));
1967 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1968 if (fiemap_s == NULL)
1971 /* get the fiemap value */
1972 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1974 GOTO(error, rc = -EFAULT);
1976 /* If fm_extent_count is non-zero, read the first extent since
1977 * it is used to calculate end_offset and device from previous
1980 if (copy_from_user(&fiemap_s->fm_extents[0],
1981 (char __user *)arg + sizeof(*fiemap_s),
1982 sizeof(struct ll_fiemap_extent)))
1983 GOTO(error, rc = -EFAULT);
1986 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1990 ret_bytes = sizeof(struct ll_user_fiemap);
1992 if (extent_count != 0)
1993 ret_bytes += (fiemap_s->fm_mapped_extents *
1994 sizeof(struct ll_fiemap_extent));
1996 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
2000 OBD_FREE_LARGE(fiemap_s, num_bytes);
2005 * Read the data_version for inode.
2007 * This value is computed using stripe object version on OST.
2008 * Version is computed using server side locking.
2010 * @param sync if do sync on the OST side;
2012 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2013 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Files without objects report version 0 (not an error). Otherwise a
 * temporary obdo is used for ll_lsm_getattr() and the version copied out
 * only when the OST reported OBD_MD_FLDATAVERSION as valid.
 */
2015 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2017 struct lov_stripe_md *lsm = NULL;
2018 struct ll_sb_info *sbi = ll_i2sbi(inode);
2019 struct obdo *obdo = NULL;
2023 /* If no stripe, we consider version is 0. */
2024 lsm = ccc_inode_lsm_get(inode);
2025 if (!lsm_has_objects(lsm)) {
2027 CDEBUG(D_INODE, "No object for inode\n");
2031 OBD_ALLOC_PTR(obdo);
2033 GOTO(out, rc = -ENOMEM);
2035 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
2037 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2040 *data_version = obdo->o_data_version;
2046 ccc_inode_lsm_put(inode, lsm);
2051 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease on the file, flushes and grabs the latest
 * data_version (LL_DV_WR_FLUSH drops cached pages under LCK_PW), merges
 * size/time attributes, then closes the open handle with the release
 * disposition. The lease open handle is closed on the error path too.
 */
2053 int ll_hsm_release(struct inode *inode)
2055 struct cl_env_nest nest;
2057 struct obd_client_handle *och = NULL;
2058 __u64 data_version = 0;
2062 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2063 ll_get_fsname(inode->i_sb, NULL, 0),
2064 PFID(&ll_i2info(inode)->lli_fid));
2066 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2068 GOTO(out, rc = PTR_ERR(och));
2070 /* Grab latest data_version and [am]time values */
2071 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2075 env = cl_env_nested_get(&nest);
2077 GOTO(out, rc = PTR_ERR(env));
2079 ll_merge_lvb(env, inode);
2080 cl_env_nested_put(&nest, env);
2082 /* Release the file.
2083 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2084 * we still need it to pack l_remote_handle to MDT. */
2085 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2091 if (och != NULL && !IS_ERR(och)) /* close the file */
2092 ll_lease_close(och, inode, NULL);
/* Per-call scratch state for ll_swap_layouts(): the two inodes being
 * swapped, saved [am]time attrs to restore afterwards, and the
 * data-version check flags/values (kept as bools so they can be
 * swap()ed along with the inodes). */
2097 struct ll_swap_stack {
2098 struct iattr ia1, ia2;
2100 struct inode *inode1, *inode2;
2101 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically swap the layouts of
 * two regular files on the same filesystem. Validates write permission on
 * both, orders the pair by FID (swapping all per-file state to match),
 * optionally takes a shared group lock to flush dirty cache, verifies
 * requested data versions, sends the swap to the MDT via obd_iocontrol,
 * and finally restores mtime/atime when the caller asked to keep them.
 * NOTE(review): several declarations/labels are elided in this view.
 */
2104 static int ll_swap_layouts(struct file *file1, struct file *file2,
2105 struct lustre_swap_layouts *lsl)
2107 struct mdc_swap_layouts msl;
2108 struct md_op_data *op_data;
2111 struct ll_swap_stack *llss = NULL;
2114 OBD_ALLOC_PTR(llss);
2118 llss->inode1 = file1->f_dentry->d_inode;
2119 llss->inode2 = file2->f_dentry->d_inode;
2121 if (!S_ISREG(llss->inode2->i_mode))
2122 GOTO(free, rc = -EINVAL);
2124 if (inode_permission(llss->inode1, MAY_WRITE) ||
2125 inode_permission(llss->inode2, MAY_WRITE))
2126 GOTO(free, rc = -EPERM);
2128 if (llss->inode2->i_sb != llss->inode1->i_sb)
2129 GOTO(free, rc = -EXDEV);
2131 /* we use 2 bool because it is easier to swap than 2 bits */
2132 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2133 llss->check_dv1 = true;
2135 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2136 llss->check_dv2 = true;
2138 /* we cannot use lsl->sl_dvX directly because we may swap them */
2139 llss->dv1 = lsl->sl_dv1;
2140 llss->dv2 = lsl->sl_dv2;
2142 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2143 if (rc == 0) /* same file, done! */
/* Canonical ordering by FID prevents lock-order deadlocks between
 * two concurrent swaps of the same pair. */
2146 if (rc < 0) { /* sequentialize it */
2147 swap(llss->inode1, llss->inode2);
2149 swap(llss->dv1, llss->dv2);
2150 swap(llss->check_dv1, llss->check_dv2);
2154 if (gid != 0) { /* application asks to flush dirty cache */
2155 rc = ll_get_grouplock(llss->inode1, file1, gid);
2159 rc = ll_get_grouplock(llss->inode2, file2, gid);
2161 ll_put_grouplock(llss->inode1, file1, gid);
2166 /* to be able to restore mtime and atime after swap
2167 * we need to first save them */
2169 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2170 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2171 llss->ia1.ia_atime = llss->inode1->i_atime;
2172 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2173 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2174 llss->ia2.ia_atime = llss->inode2->i_atime;
2175 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2178 /* ultimate check, before swaping the layouts we check if
2179 * dataversion has changed (if requested) */
2180 if (llss->check_dv1) {
2181 rc = ll_data_version(llss->inode1, &dv, 0);
2184 if (dv != llss->dv1)
2185 GOTO(putgl, rc = -EAGAIN);
2188 if (llss->check_dv2) {
2189 rc = ll_data_version(llss->inode2, &dv, 0);
2192 if (dv != llss->dv2)
2193 GOTO(putgl, rc = -EAGAIN);
2196 /* struct md_op_data is used to send the swap args to the mdt
2197 * only flags is missing, so we use struct mdc_swap_layouts
2198 * through the md_op_data->op_data */
2199 /* flags from user space have to be converted before they are send to
2200 * server, no flag is sent today, they are only used on the client */
2203 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2204 0, LUSTRE_OPC_ANY, &msl);
2205 if (IS_ERR(op_data))
2206 GOTO(free, rc = PTR_ERR(op_data));
2208 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2209 sizeof(*op_data), op_data, NULL);
2210 ll_finish_md_op_data(op_data);
2214 ll_put_grouplock(llss->inode2, file2, gid);
2215 ll_put_grouplock(llss->inode1, file1, gid);
2218 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2222 /* clear useless flags */
2223 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2224 llss->ia1.ia_valid &= ~ATTR_MTIME;
2225 llss->ia2.ia_valid &= ~ATTR_MTIME;
2228 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2229 llss->ia1.ia_valid &= ~ATTR_ATIME;
2230 llss->ia2.ia_valid &= ~ATTR_ATIME;
2233 /* update time if requested */
/* Note the crossed indices: ia2 (saved from inode2) is applied to
 * file1 and ia1 to file2, because the layouts were exchanged. */
2235 if (llss->ia2.ia_valid != 0) {
2236 mutex_lock(&llss->inode1->i_mutex);
2237 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2238 mutex_unlock(&llss->inode1->i_mutex);
2241 if (llss->ia1.ia_valid != 0) {
2244 mutex_lock(&llss->inode2->i_mutex);
2245 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2246 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Push an HSM state change (\a hss set/clear masks) to the MDT.
 * Only root may touch flags outside HSM_USER_MASK.
 */
2258 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2260 struct md_op_data *op_data;
2263 /* Non-root users are forbidden to set or clear flags which are
2264 * NOT defined in HSM_USER_MASK. */
2265 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2266 !cfs_capable(CFS_CAP_SYS_ADMIN))
2269 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2270 LUSTRE_OPC_ANY, hss);
2271 if (IS_ERR(op_data))
2272 RETURN(PTR_ERR(op_data));
2274 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2275 sizeof(*op_data), op_data, NULL);
2277 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT backend: mark a freshly-created regular file as an
 * archived+released HSM copy (archive id from \a hui), then force its
 * mode/uid/gid/size/times to the values recorded in the archive via
 * ll_setattr_raw() under i_mutex. Regular files only.
 */
2282 static int ll_hsm_import(struct inode *inode, struct file *file,
2283 struct hsm_user_import *hui)
2285 struct hsm_state_set *hss = NULL;
2286 struct iattr *attr = NULL;
2290 if (!S_ISREG(inode->i_mode))
2296 GOTO(out, rc = -ENOMEM);
2298 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2299 hss->hss_archive_id = hui->hui_archive_id;
2300 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2301 rc = ll_hsm_state_set(inode, hss);
2305 OBD_ALLOC_PTR(attr);
2307 GOTO(out, rc = -ENOMEM);
/* Restore the archived attributes; S_IFREG is forced because the
 * mode from userspace carries only permission bits. */
2309 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2310 attr->ia_mode |= S_IFREG;
2311 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2312 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2313 attr->ia_size = hui->hui_size;
2314 attr->ia_mtime.tv_sec = hui->hui_mtime;
2315 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2316 attr->ia_atime.tv_sec = hui->hui_atime;
2317 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2319 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2320 ATTR_UID | ATTR_GID |
2321 ATTR_MTIME | ATTR_MTIME_SET |
2322 ATTR_ATIME | ATTR_ATIME_SET;
2324 mutex_lock(&inode->i_mutex);
2326 rc = ll_setattr_raw(file->f_dentry, attr, true);
2330 mutex_unlock(&inode->i_mutex);
/* Map an fmode_t to the LL_LEASE_{RD,WR}LCK bitmask reported to
 * userspace by the lease ioctls. */
2342 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2344 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2345 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files: per-fd flag manipulation,
 * LOV stripe get/set/swap, FIEMAP, group locks, FID/path translation,
 * data versions, HSM state/action/import, and lease get/set. Unknown
 * commands fall through to the registered llioc handlers and finally to
 * the data export's obd_iocontrol().
 * NOTE(review): many case labels/RETURNs are elided in this view; the
 * comments below only annotate the visible statements.
 */
2349 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2351 struct inode *inode = file->f_dentry->d_inode;
2352 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2357 PFID(ll_inode2fid(inode)), inode, cmd);
2358 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2360 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2361 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2365 case LL_IOC_GETFLAGS:
2366 /* Get the current value of the file flags */
2367 return put_user(fd->fd_flags, (int __user *)arg);
2368 case LL_IOC_SETFLAGS:
2369 case LL_IOC_CLRFLAGS:
2370 /* Set or clear specific file flags */
2371 /* XXX This probably needs checks to ensure the flags are
2372 * not abused, and to handle any flag side effects.
2374 if (get_user(flags, (int __user *) arg))
2377 if (cmd == LL_IOC_SETFLAGS) {
/* Lock-bypass is only safe for O_DIRECT IO. */
2378 if ((flags & LL_FILE_IGNORE_LOCK) &&
2379 !(file->f_flags & O_DIRECT)) {
2380 CERROR("%s: unable to disable locking on "
2381 "non-O_DIRECT file\n", current->comm);
2385 fd->fd_flags |= flags;
2387 fd->fd_flags &= ~flags;
2390 case LL_IOC_LOV_SETSTRIPE:
2391 RETURN(ll_lov_setstripe(inode, file, arg));
2392 case LL_IOC_LOV_SETEA:
2393 RETURN(ll_lov_setea(inode, file, arg));
2394 case LL_IOC_LOV_SWAP_LAYOUTS: {
2396 struct lustre_swap_layouts lsl;
2398 if (copy_from_user(&lsl, (char __user *)arg,
2399 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2402 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2405 file2 = fget(lsl.sl_fd);
2410 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2411 rc = ll_swap_layouts(file, file2, &lsl);
2415 case LL_IOC_LOV_GETSTRIPE:
2416 RETURN(ll_lov_getstripe(inode, arg));
2417 case LL_IOC_RECREATE_OBJ:
2418 RETURN(ll_lov_recreate_obj(inode, arg));
2419 case LL_IOC_RECREATE_FID:
2420 RETURN(ll_lov_recreate_fid(inode, arg));
2421 case FSFILT_IOC_FIEMAP:
2422 RETURN(ll_ioctl_fiemap(inode, arg));
2423 case FSFILT_IOC_GETFLAGS:
2424 case FSFILT_IOC_SETFLAGS:
2425 RETURN(ll_iocontrol(inode, file, cmd, arg));
2426 case FSFILT_IOC_GETVERSION_OLD:
2427 case FSFILT_IOC_GETVERSION:
2428 RETURN(put_user(inode->i_generation, (int __user *)arg));
2429 case LL_IOC_GROUP_LOCK:
2430 RETURN(ll_get_grouplock(inode, file, arg));
2431 case LL_IOC_GROUP_UNLOCK:
2432 RETURN(ll_put_grouplock(inode, file, arg));
2433 case IOC_OBD_STATFS:
2434 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2436 /* We need to special case any other ioctls we want to handle,
2437 * to send them to the MDS/OST as appropriate and to properly
2438 * network encode the arg field.
2439 case FSFILT_IOC_SETVERSION_OLD:
2440 case FSFILT_IOC_SETVERSION:
2442 case LL_IOC_FLUSHCTX:
2443 RETURN(ll_flush_ctx(inode));
2444 case LL_IOC_PATH2FID: {
2445 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2446 sizeof(struct lu_fid)))
2451 case LL_IOC_GETPARENT:
2452 RETURN(ll_getparent(file, (void __user *)arg));
2454 case OBD_IOC_FID2PATH:
2455 RETURN(ll_fid2path(inode, (void __user *)arg));
2456 case LL_IOC_DATA_VERSION: {
2457 struct ioc_data_version idv;
2460 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the defined flush flags may be passed through. */
2463 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2464 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2467 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2473 case LL_IOC_GET_MDTIDX: {
2476 mdtidx = ll_get_mdt_idx(inode);
2480 if (put_user((int)mdtidx, (int __user *)arg))
2485 case OBD_IOC_GETDTNAME:
2486 case OBD_IOC_GETMDNAME:
2487 RETURN(ll_get_obd_name(inode, cmd, arg));
2488 case LL_IOC_HSM_STATE_GET: {
2489 struct md_op_data *op_data;
2490 struct hsm_user_state *hus;
2497 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2498 LUSTRE_OPC_ANY, hus);
2499 if (IS_ERR(op_data)) {
2501 RETURN(PTR_ERR(op_data));
2504 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2507 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2510 ll_finish_md_op_data(op_data);
2514 case LL_IOC_HSM_STATE_SET: {
2515 struct hsm_state_set *hss;
2522 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2527 rc = ll_hsm_state_set(inode, hss);
2532 case LL_IOC_HSM_ACTION: {
2533 struct md_op_data *op_data;
2534 struct hsm_current_action *hca;
2541 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2542 LUSTRE_OPC_ANY, hca);
2543 if (IS_ERR(op_data)) {
2545 RETURN(PTR_ERR(op_data));
2548 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2551 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2554 ll_finish_md_op_data(op_data);
2558 case LL_IOC_SET_LEASE: {
2559 struct ll_inode_info *lli = ll_i2info(inode);
2560 struct obd_client_handle *och = NULL;
/* Requested lease mode must not exceed the open mode. */
2565 case LL_LEASE_WRLCK:
2566 if (!(file->f_mode & FMODE_WRITE))
2568 fmode = FMODE_WRITE;
2570 case LL_LEASE_RDLCK:
2571 if (!(file->f_mode & FMODE_READ))
2575 case LL_LEASE_UNLCK:
2576 mutex_lock(&lli->lli_och_mutex);
2577 if (fd->fd_lease_och != NULL) {
2578 och = fd->fd_lease_och;
2579 fd->fd_lease_och = NULL;
2581 mutex_unlock(&lli->lli_och_mutex);
2586 fmode = och->och_flags;
2587 rc = ll_lease_close(och, inode, &lease_broken);
/* Report the mode of the lease that was just released. */
2594 RETURN(ll_lease_type_from_fmode(fmode));
2599 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2601 /* apply for lease */
2602 och = ll_lease_open(inode, file, fmode, 0);
2604 RETURN(PTR_ERR(och));
2607 mutex_lock(&lli->lli_och_mutex);
2608 if (fd->fd_lease_och == NULL) {
2609 fd->fd_lease_och = och;
2612 mutex_unlock(&lli->lli_och_mutex);
2614 /* impossible now that only excl is supported for now */
2615 ll_lease_close(och, inode, &lease_broken);
2620 case LL_IOC_GET_LEASE: {
2621 struct ll_inode_info *lli = ll_i2info(inode);
2622 struct ldlm_lock *lock = NULL;
2625 mutex_lock(&lli->lli_och_mutex);
2626 if (fd->fd_lease_och != NULL) {
2627 struct obd_client_handle *och = fd->fd_lease_och;
2629 lock = ldlm_handle2lock(&och->och_lease_handle);
2631 lock_res_and_lock(lock);
/* A cancelled lease lock no longer counts as held. */
2632 if (!ldlm_is_cancel(lock))
2633 fmode = och->och_flags;
2635 unlock_res_and_lock(lock);
2636 LDLM_LOCK_PUT(lock);
2639 mutex_unlock(&lli->lli_och_mutex);
2641 RETURN(ll_lease_type_from_fmode(fmode));
2643 case LL_IOC_HSM_IMPORT: {
2644 struct hsm_user_import *hui;
2650 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2655 rc = ll_hsm_import(inode, file, hui);
2665 ll_iocontrol_call(inode, file, cmd, arg, &err))
2668 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2669 (void __user *)arg));
2674 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Compat helper for kernels that lack generic_file_llseek_size():
 * validate @offset against the file mode and @maxsize, then commit it
 * to file->f_pos, clearing f_version so stale readdir state is dropped.
 * NOTE(review): intervening source lines (error-return branches, braces)
 * are elided in this chunk.
 */
2675 static inline loff_t
2676 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
/* Negative offsets are only legal when FMODE_UNSIGNED_OFFSET is set. */
2678 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
/* Never seek beyond the filesystem's maximum file size. */
2680 if (offset > maxsize)
/* Only rewrite f_pos when the position actually changes. */
2683 if (offset != file->f_pos) {
2684 file->f_pos = offset;
2685 file->f_version = 0;
/*
 * Local implementation of generic_file_llseek_size() for older kernels.
 * Computes the new position for the various seek origins; SEEK_CUR
 * updates are serialized under i_mutex to avoid racing concurrent
 * read()/write()/lseek() calls.
 * NOTE(review): the switch/case labels for the origins are elided in
 * this chunk; comments describe only the visible fragments.
 */
2691 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2692 loff_t maxsize, loff_t eof)
2694 struct inode *inode = file->f_dentry->d_inode;
2702 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2703 * position-querying operation. Avoid rewriting the "same"
2704 * f_pos value back to the file because a concurrent read(),
2705 * write() or lseek() might have altered it
2710 * f_lock protects against read/modify/write race with other
2711 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: apply the relative offset atomically w.r.t. other seekers. */
2714 mutex_lock(&inode->i_mutex);
2715 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2716 mutex_unlock(&inode->i_mutex);
2720 * In the generic case the entire file is data, so as long as
2721 * offset isn't at the end of the file then the offset is data.
2728 * There is a virtual hole at the end of the file, so as long as
2729 * offset isn't i_size or larger, return i_size.
2737 return llseek_execute(file, offset, maxsize);
/*
 * llseek handler for Lustre regular files.  Origins that depend on the
 * file size (SEEK_END/SEEK_HOLE/SEEK_DATA) must first refresh i_size
 * from the OSTs via a glimpse RPC before delegating to the generic
 * llseek-size logic bounded by ll_file_maxbytes().
 */
2741 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2743 struct inode *inode = file->f_dentry->d_inode;
2744 loff_t retval, eof = 0;
/* Tentative absolute position, used only for the trace message below. */
2747 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2748 (origin == SEEK_CUR) ? file->f_pos : 0);
2749 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2750 PFID(ll_inode2fid(inode)), inode, retval, retval,
2752 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* Size-dependent origins need an up-to-date size from the OSTs. */
2754 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2755 retval = ll_glimpse_size(inode);
2758 eof = i_size_read(inode);
2761 retval = ll_generic_file_llseek_size(file, offset, origin,
2762 ll_file_maxbytes(inode), eof);
/*
 * flush() handler: report (once) any asynchronous writeback error that
 * was recorded against this inode.  Returns -EIO if an async error is
 * pending and has not yet been reported to the application, else 0.
 */
2766 static int ll_flush(struct file *file, fl_owner_t id)
2768 struct inode *inode = file->f_dentry->d_inode;
2769 struct ll_inode_info *lli = ll_i2info(inode);
2770 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2773 LASSERT(!S_ISDIR(inode->i_mode));
2775 /* catch async errors that were recorded back when async writeback
2776 * failed for pages in this mapping. */
2777 rc = lli->lli_async_rc;
2778 lli->lli_async_rc = 0;
/* Also collect and clear any async rc recorded at the lov layer. */
2779 if (lli->lli_clob != NULL) {
2780 err = lov_read_and_clear_async_rc(lli->lli_clob);
2785 /* The application has been told write failure already.
2786 * Do not report failure again. */
2787 if (fd->fd_write_failed)
2789 return rc ? -EIO : 0;
2793 * Called to make sure a portion of file has been written out.
2794 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2796 * Return how many pages have been written.
/*
 * \param inode         file whose [start, end] byte range is synced
 * \param mode          one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout nonzero to run even during layout change
 * \retval              number of pages written, or negative errno
 */
2798 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2799 enum cl_fsync_mode mode, int ignore_layout)
2801 struct cl_env_nest nest;
2804 struct obd_capa *capa = NULL;
2805 struct cl_fsync_io *fio;
/* Reject any mode outside the known set before doing work. */
2809 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2810 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2813 env = cl_env_nested_get(&nest);
2815 RETURN(PTR_ERR(env));
2817 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2819 io = ccc_env_thread_io(env);
2820 io->ci_obj = cl_i2info(inode)->lli_clob;
2821 io->ci_ignore_layout = ignore_layout;
2823 /* initialize parameters for sync */
2824 fio = &io->u.ci_fsync;
2825 fio->fi_capa = capa;
2826 fio->fi_start = start;
2828 fio->fi_fid = ll_inode2fid(inode);
2829 fio->fi_mode = mode;
2830 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io loop; on success report the page count. */
2832 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2833 result = cl_io_loop(env, io);
2835 result = io->ci_result;
2837 result = fio->fi_nr_written;
2838 cl_io_fini(env, io);
2839 cl_env_nested_put(&nest, env);
2847 * When dentry is provided (the 'else' case), *file->f_dentry may be
2848 * null and dentry must be used directly rather than pulled from
2849 * *file->f_dentry as is done otherwise.
/*
 * fsync() handler.  Three kernel-API variants are selected by #ifdef;
 * all converge on: flush dirty pages, collect recorded async errors,
 * sync metadata via md_fsync(), and for regular files sync data via
 * cl_sync_file_range(CL_FSYNC_ALL), tracking fd_write_failed.
 */
2852 #ifdef HAVE_FILE_FSYNC_4ARGS
2853 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2855 struct dentry *dentry = file->f_dentry;
2856 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2857 int ll_fsync(struct file *file, int datasync)
2859 struct dentry *dentry = file->f_dentry;
2861 loff_t end = LLONG_MAX;
2863 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2866 loff_t end = LLONG_MAX;
2868 struct inode *inode = dentry->d_inode;
2869 struct ll_inode_info *lli = ll_i2info(inode);
2870 struct ptlrpc_request *req;
2871 struct obd_capa *oc;
2875 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2876 PFID(ll_inode2fid(inode)), inode);
2877 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2879 #ifdef HAVE_FILE_FSYNC_4ARGS
2880 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2881 mutex_lock(&inode->i_mutex);
2883 /* fsync's caller has already called _fdata{sync,write}, we want
2884 * that IO to finish before calling the osc and mdc sync methods */
2885 rc = filemap_fdatawait(inode->i_mapping);
2888 /* catch async errors that were recorded back when async writeback
2889 * failed for pages in this mapping. */
2890 if (!S_ISDIR(inode->i_mode)) {
2891 err = lli->lli_async_rc;
2892 lli->lli_async_rc = 0;
2895 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT. */
2900 oc = ll_mdscapa_get(inode);
2901 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2907 ptlrpc_req_finished(req);
/* For regular files also force the data out to the OSTs. */
2909 if (S_ISREG(inode->i_mode)) {
2910 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2912 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2913 if (rc == 0 && err < 0)
2916 fd->fd_write_failed = true;
2918 fd->fd_write_failed = false;
2921 #ifdef HAVE_FILE_FSYNC_4ARGS
2922 mutex_unlock(&inode->i_mutex);
/*
 * flock()/fcntl() byte-range lock handler.  Translates the kernel
 * file_lock into an LDLM_FLOCK enqueue against the MDT, then mirrors
 * the result into the local lock tables (flock_lock_file_wait /
 * posix_lock_file_wait).  If the local step fails, the server lock is
 * released again by re-enqueueing with LCK_NL.
 * NOTE(review): the switch statements mapping fl_type/cmd to modes and
 * flags are partially elided in this chunk.
 */
2928 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2930 struct inode *inode = file->f_dentry->d_inode;
2931 struct ll_sb_info *sbi = ll_i2sbi(inode);
2932 struct ldlm_enqueue_info einfo = {
2933 .ei_type = LDLM_FLOCK,
2934 .ei_cb_cp = ldlm_flock_completion_ast,
2935 .ei_cbdata = file_lock,
2937 struct md_op_data *op_data;
2938 struct lustre_handle lockh = {0};
2939 ldlm_policy_data_t flock = {{0}};
2940 int fl_type = file_lock->fl_type;
2946 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2947 PFID(ll_inode2fid(inode)), file_lock);
2949 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2951 if (file_lock->fl_flags & FL_FLOCK) {
2952 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2953 /* flocks are whole-file locks */
2954 flock.l_flock.end = OFFSET_MAX;
2955 /* For flocks owner is determined by the local file descriptor*/
2956 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2957 } else if (file_lock->fl_flags & FL_POSIX) {
2958 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2959 flock.l_flock.start = file_lock->fl_start;
2960 flock.l_flock.end = file_lock->fl_end;
2964 flock.l_flock.pid = file_lock->fl_pid;
2966 /* Somewhat ugly workaround for svc lockd.
2967 * lockd installs custom fl_lmops->lm_compare_owner that checks
2968 * for the fl_owner to be the same (which it always is on local node
2969 * I guess between lockd processes) and then compares pid.
2970 * As such we assign pid to the owner field to make it all work,
2971 * conflict with normal locks is unlikely since pid space and
2972 * pointer space for current->files are not intersecting */
2973 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2974 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2978 einfo.ei_mode = LCK_PR;
2981 /* An unlock request may or may not have any relation to
2982 * existing locks so we may not be able to pass a lock handle
2983 * via a normal ldlm_lock_cancel() request. The request may even
2984 * unlock a byte range in the middle of an existing lock. In
2985 * order to process an unlock request we need all of the same
2986 * information that is given with a normal read or write record
2987 * lock request. To avoid creating another ldlm unlock (cancel)
2988 * message we'll treat a LCK_NL flock request as an unlock. */
2989 einfo.ei_mode = LCK_NL;
2992 einfo.ei_mode = LCK_PW;
2995 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3010 flags = LDLM_FL_BLOCK_NOWAIT;
3016 flags = LDLM_FL_TEST_LOCK;
3019 CERROR("unknown fcntl lock command: %d\n", cmd);
3023 /* Save the old mode so that if the mode in the lock changes we
3024 * can decrement the appropriate reader or writer refcount. */
3025 file_lock->fl_type = einfo.ei_mode;
3027 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3028 LUSTRE_OPC_ANY, NULL);
3029 if (IS_ERR(op_data))
3030 RETURN(PTR_ERR(op_data));
3032 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3033 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3034 flock.l_flock.pid, flags, einfo.ei_mode,
3035 flock.l_flock.start, flock.l_flock.end);
/* Enqueue the flock on the MDT. */
3037 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3040 /* Restore the file lock type if not TEST lock. */
3041 if (!(flags & LDLM_FL_TEST_LOCK))
3042 file_lock->fl_type = fl_type;
/* Mirror a successful server lock into the local VFS lock tables. */
3044 if ((file_lock->fl_flags & FL_FLOCK) &&
3045 (rc == 0 || file_lock->fl_type == F_UNLCK))
3046 rc2 = flock_lock_file_wait(file, file_lock);
3047 if ((file_lock->fl_flags & FL_POSIX) &&
3048 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3049 !(flags & LDLM_FL_TEST_LOCK))
3050 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed: undo the server-side lock via LCK_NL. */
3052 if (rc2 && file_lock->fl_type != F_UNLCK) {
3053 einfo.ei_mode = LCK_NL;
3054 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3059 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name RPC.
 * On success *fid is filled from the reply body.
 */
3064 int ll_get_fid_by_name(struct inode *parent, const char *name,
3065 int namelen, struct lu_fid *fid)
3067 struct md_op_data *op_data = NULL;
3068 struct mdt_body *body;
3069 struct ptlrpc_request *req;
3073 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3074 LUSTRE_OPC_ANY, NULL);
3075 if (IS_ERR(op_data))
3076 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
3078 op_data->op_valid = OBD_MD_FLID;
3079 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3080 ll_finish_md_op_data(op_data);
3084 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3086 GOTO(out_req, rc = -EFAULT);
3088 *fid = body->mbo_fid1;
3090 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under directory @parent to MDT @mdtidx using
 * a CLI_MIGRATE rename-to-self RPC.  The child FID is taken from the
 * dcache if possible, otherwise fetched by name; a no-op if the child
 * already resides on the target MDT.
 */
3094 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3095 const char *name, int namelen)
3097 struct dentry *dchild = NULL;
3098 struct inode *child_inode = NULL;
3099 struct md_op_data *op_data;
3100 struct ptlrpc_request *request = NULL;
3105 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3106 name, PFID(ll_inode2fid(parent)), mdtidx);
3108 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3109 0, LUSTRE_OPC_ANY, NULL);
3110 if (IS_ERR(op_data))
3111 RETURN(PTR_ERR(op_data));
3113 /* Get child FID first */
3114 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry; invalidate aliases before migrating. */
3117 dchild = d_lookup(file->f_dentry, &qstr);
3118 if (dchild != NULL && dchild->d_inode != NULL) {
3119 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3120 if (dchild->d_inode != NULL) {
3121 child_inode = igrab(dchild->d_inode);
3122 ll_invalidate_aliases(child_inode);
/* Not in the dcache: ask the MDS for the FID by name. */
3126 rc = ll_get_fid_by_name(parent, name, namelen,
3132 if (!fid_is_sane(&op_data->op_fid3)) {
3133 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3134 ll_get_fsname(parent->i_sb, NULL, 0), name,
3135 PFID(&op_data->op_fid3));
3136 GOTO(out_free, rc = -EINVAL);
3139 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
/* Already on the target MDT: nothing to do. */
3144 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3145 PFID(&op_data->op_fid3), mdtidx);
3146 GOTO(out_free, rc = 0);
3149 op_data->op_mds = mdtidx;
3150 op_data->op_cli_flags = CLI_MIGRATE;
/* Migration is implemented as a rename onto the same name. */
3151 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3152 namelen, name, namelen, &request);
3154 ll_update_times(request, parent);
3156 ptlrpc_req_finished(request);
3161 if (child_inode != NULL) {
3162 clear_nlink(child_inode);
3166 ll_finish_md_op_data(op_data);
/* Stub installed for "-o noflock" mounts; body elided in this chunk
 * (per the table comment below it returns ENOSYS for flock calls). */
3171 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3179 * test if some locks matching bits and l_req_mode are acquired
3180 * - bits can be in different locks
3181 * - if found clear the common lock bits in *bits
3182 * - the bits not found, are kept in *bits
3184 * \param bits [IN] searched lock bits [IN]
3185 * \param l_req_mode [IN] searched lock mode
3186 * \retval boolean, true iff all bits are found
3188 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3190 struct lustre_handle lockh;
3191 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all four modes at once. */
3192 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3193 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3202 fid = &ll_i2info(inode)->lli_fid;
3203 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3204 ldlm_lockname[mode]);
3206 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time; stop early once all bits matched. */
3207 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3208 policy.l_inodebits.bits = *bits & (1 << i);
3209 if (policy.l_inodebits.bits == 0)
3212 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3213 &policy, mode, &lockh)) {
3214 struct ldlm_lock *lock;
3216 lock = ldlm_handle2lock(&lockh);
3219 ~(lock->l_policy_data.l_inodebits.bits);
3220 LDLM_LOCK_PUT(lock);
3222 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MD lock covering
 * @bits on @inode; the handle is returned through @lockh.  Returns the
 * matched mode, or 0 if no lock was found.
 */
3229 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3230 struct lustre_handle *lockh, __u64 flags,
3233 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3238 fid = &ll_i2info(inode)->lli_fid;
3239 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3241 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3242 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Common tail for revalidation: translate the RPC result into the value
 * returned to the VFS.  -ENOENT on a non-regular, non-directory inode is
 * treated as "already unlinked" rather than an error.
 */
3248 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3249 /* Already unlinked. Just update nlink and return success */
3250 if (rc == -ENOENT) {
3252 /* This path cannot be hit for regular files unless in
3253 * case of obscure races, so no need to validate
3255 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3257 } else if (rc != 0) {
/* EACCES/EIDRM are expected under permission/identity races; keep
 * them at D_INFO so they do not spam the console. */
3258 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3259 "%s: revalidate FID "DFID" error: rc = %d\n",
3260 ll_get_fsname(inode->i_sb, NULL, 0),
3261 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's metadata covered by the ibits in @ibits.  Two
 * paths: servers supporting OBD_CONNECT_ATTRFID take an intent-lock
 * getattr by FID; otherwise, if no matching MD lock is cached locally,
 * a plain md_getattr() refreshes the inode.
 */
3267 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3269 struct inode *inode = dentry->d_inode;
3270 struct ptlrpc_request *req = NULL;
3271 struct obd_export *exp;
3275 LASSERT(inode != NULL);
3277 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3278 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3280 exp = ll_i2mdexp(inode);
3282 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3283 * But under CMD case, it caused some lock issues, should be fixed
3284 * with new CMD ibits lock. See bug 12718 */
3285 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3286 struct lookup_intent oit = { .it_op = IT_GETATTR };
3287 struct md_op_data *op_data;
/* A pure LOOKUP-bit revalidation uses the cheaper IT_LOOKUP intent. */
3289 if (ibits == MDS_INODELOCK_LOOKUP)
3290 oit.it_op = IT_LOOKUP;
3292 /* Call getattr by fid, so do not provide name at all. */
3293 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3294 dentry->d_inode, NULL, 0, 0,
3295 LUSTRE_OPC_ANY, NULL);
3296 if (IS_ERR(op_data))
3297 RETURN(PTR_ERR(op_data));
3299 rc = md_intent_lock(exp, op_data, &oit, &req,
3300 &ll_md_blocking_ast, 0);
3301 ll_finish_md_op_data(op_data);
3303 rc = ll_inode_revalidate_fini(inode, rc);
3307 rc = ll_revalidate_it_finish(req, &oit, dentry);
3309 ll_intent_release(&oit);
3313 /* Unlinked? Unhash dentry, so it is not picked up later by
3314 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3315 here to preserve get_cwd functionality on 2.6.
3317 if (!dentry->d_inode->i_nlink)
3318 d_lustre_invalidate(dentry, 0);
3320 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only issue a getattr RPC when we do not already
 * hold MD locks covering the requested bits. */
3321 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3322 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3323 obd_valid valid = OBD_MD_FLGETATTR;
3324 struct md_op_data *op_data;
/* Regular files also need striping EA data in the reply. */
3327 if (S_ISREG(inode->i_mode)) {
3328 rc = ll_get_default_mdsize(sbi, &ealen);
3331 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3334 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3335 0, ealen, LUSTRE_OPC_ANY,
3337 if (IS_ERR(op_data))
3338 RETURN(PTR_ERR(op_data));
3340 op_data->op_valid = valid;
3341 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3342 * capa for this inode. Because we only keep capas of dirs
3344 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3345 ll_finish_md_op_data(op_data);
3347 rc = ll_inode_revalidate_fini(inode, rc);
3351 rc = ll_prep_inode(&inode, req, NULL, NULL);
3354 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes (size, nlink,
 * a/m/ctime) from all MDTs into this inode's cached values.
 */
3358 static int ll_merge_md_attr(struct inode *inode)
3360 struct cl_attr attr = { 0 };
3363 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3364 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
3369 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3370 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3372 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3373 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3374 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidation wrapper: refresh metadata via __ll_inode_revalidate,
 * then refresh size — merged stripe attributes for striped directories,
 * or a glimpse RPC for regular files (skipped during HSM restore, where
 * the MDT already supplied the correct size).
 */
3380 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3382 struct inode *inode = dentry->d_inode;
3386 rc = __ll_inode_revalidate(dentry, ibits);
3390 /* if object isn't regular file, don't validate size */
3391 if (!S_ISREG(inode->i_mode)) {
3392 if (S_ISDIR(inode->i_mode) &&
3393 ll_i2info(inode)->lli_lsm_md != NULL) {
3394 rc = ll_merge_md_attr(inode);
3399 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3400 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3401 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3403 /* In case of restore, the MDT has the right size and has
3404 * already send it back without granting the layout lock,
3405 * inode is up-to-date so glimpse is useless.
3406 * Also to glimpse we need the layout, in case of a running
3407 * restore the MDT holds the layout lock so the glimpse will
3408 * block up to the end of restore (getattr will block)
3410 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3411 rc = ll_glimpse_size(inode);
/*
 * getattr() handler: revalidate UPDATE|LOOKUP bits, then copy the inode
 * attributes into @stat.  Striped directories report the merged
 * cross-MDT size/nlink instead of the local inode's values.
 */
3416 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3418 struct inode *inode = de->d_inode;
3419 struct ll_sb_info *sbi = ll_i2sbi(inode);
3420 struct ll_inode_info *lli = ll_i2info(inode);
3423 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3424 MDS_INODELOCK_LOOKUP);
3425 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3430 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits. */
3431 if (ll_need_32bit_api(sbi))
3432 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3434 stat->ino = inode->i_ino;
3435 stat->mode = inode->i_mode;
3436 stat->uid = inode->i_uid;
3437 stat->gid = inode->i_gid;
3438 stat->rdev = inode->i_rdev;
3439 stat->atime = inode->i_atime;
3440 stat->mtime = inode->i_mtime;
3441 stat->ctime = inode->i_ctime;
3442 stat->blksize = 1 << inode->i_blkbits;
3443 stat->blocks = inode->i_blocks;
3445 if (S_ISDIR(inode->i_mode) &&
3446 ll_i2info(inode)->lli_lsm_md != NULL) {
3447 stat->nlink = lli->lli_stripe_dir_nlink;
3448 stat->size = lli->lli_stripe_dir_size;
3450 stat->nlink = inode->i_nlink;
3451 stat->size = i_size_read(inode);
/*
 * fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * Lustre ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped
 * extents back.  Buffer is sized for fi_extents_max extents.
 */
3457 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3458 __u64 start, __u64 len)
3462 struct ll_user_fiemap *fiemap;
3463 unsigned int extent_count = fieinfo->fi_extents_max;
3465 num_bytes = sizeof(*fiemap) + (extent_count *
3466 sizeof(struct ll_fiemap_extent));
3467 OBD_ALLOC_LARGE(fiemap, num_bytes);
3472 fiemap->fm_flags = fieinfo->fi_flags;
3473 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3474 fiemap->fm_start = start;
3475 fiemap->fm_length = len;
/* Seed only the first extent from the caller (continuation support). */
3476 if (extent_count > 0)
3477 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3478 sizeof(struct ll_fiemap_extent));
3480 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3482 fieinfo->fi_flags = fiemap->fm_flags;
3483 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3484 if (extent_count > 0)
3485 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3486 fiemap->fm_mapped_extents *
3487 sizeof(struct ll_fiemap_extent));
3489 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * Return a referenced copy of the cached POSIX ACL for @inode.
 * The caller (VFS permission check) releases the reference.
 */
3493 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3495 struct ll_inode_info *lli = ll_i2info(inode);
3496 struct posix_acl *acl = NULL;
3499 spin_lock(&lli->lli_lock);
3500 /* VFS' acl_permission_check->check_acl will release the refcount */
3501 acl = posix_acl_dup(lli->lli_posix_acl);
3502 spin_unlock(&lli->lli_lock);
3507 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL permission callback for older generic_permission() APIs; variants
 * selected by #ifdef.  Without CONFIG_FS_POSIX_ACL this compiles to a
 * stub (body elided in this chunk).
 */
3509 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3510 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3512 ll_check_acl(struct inode *inode, int mask)
3515 # ifdef CONFIG_FS_POSIX_ACL
3516 struct posix_acl *acl;
/* Cannot take locks/allocate under RCU-walk; ask VFS to retry. */
3520 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3521 if (flags & IPERM_FLAG_RCU)
3524 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3529 rc = posix_acl_permission(inode, acl, mask);
3530 posix_acl_release(acl);
3533 # else /* !CONFIG_FS_POSIX_ACL */
3535 # endif /* CONFIG_FS_POSIX_ACL */
3537 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3539 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
/*
 * permission() handler.  Revalidates the root inode on first access,
 * applies root-squash by temporarily overriding the task credentials
 * (dropping FS capabilities), then delegates to the remote-permission
 * check or generic_permission()+ll_check_acl.
 */
3540 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3542 # ifdef HAVE_INODE_PERMISION_2ARGS
3543 int ll_inode_permission(struct inode *inode, int mask)
3545 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3550 struct ll_sb_info *sbi;
3551 struct root_squash_info *squash;
3552 struct cred *cred = NULL;
3553 const struct cred *old_cred = NULL;
3555 bool squash_id = false;
/* RCU-walk mode cannot block on RPCs; ask VFS to retry in ref-walk. */
3558 #ifdef MAY_NOT_BLOCK
3559 if (mask & MAY_NOT_BLOCK)
3561 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3562 if (flags & IPERM_FLAG_RCU)
3566 /* as root inode are NOT getting validated in lookup operation,
3567 * need to do it before permission check. */
3569 if (inode == inode->i_sb->s_root->d_inode) {
3570 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3571 MDS_INODELOCK_LOOKUP);
3576 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3577 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3579 /* squash fsuid/fsgid if needed */
3580 sbi = ll_i2sbi(inode);
3581 squash = &sbi->ll_squash;
3582 if (unlikely(squash->rsi_uid != 0 &&
3583 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3584 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3588 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3589 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3590 squash->rsi_uid, squash->rsi_gid);
3592 /* update current process's credentials
3593 * and FS capability */
3594 cred = prepare_creds();
3598 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3599 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability from the squashed creds. */
3600 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3601 if ((1 << cap) & CFS_CAP_FS_MASK)
3602 cap_lower(cred->cap_effective, cap);
3604 old_cred = override_creds(cred);
3607 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3609 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3610 rc = lustre_check_remote_perm(inode, mask);
3612 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3614 /* restore current process's credentials and FS capability */
3616 revert_creds(old_cred);
3623 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no .flock/.lock, so the kernel falls back to
 * local-only POSIX/flock semantics. */
3624 struct file_operations ll_file_operations = {
3625 .read = ll_file_read,
3626 .aio_read = ll_file_aio_read,
3627 .write = ll_file_write,
3628 .aio_write = ll_file_aio_write,
3629 .unlocked_ioctl = ll_file_ioctl,
3630 .open = ll_file_open,
3631 .release = ll_file_release,
3632 .mmap = ll_file_mmap,
3633 .llseek = ll_file_seek,
3634 .splice_read = ll_file_splice_read,
/* File operations for "-o flock": cluster-coherent flock/POSIX locks
 * routed through ll_file_flock (LDLM-backed). */
3639 struct file_operations ll_file_operations_flock = {
3640 .read = ll_file_read,
3641 .aio_read = ll_file_aio_read,
3642 .write = ll_file_write,
3643 .aio_write = ll_file_aio_write,
3644 .unlocked_ioctl = ll_file_ioctl,
3645 .open = ll_file_open,
3646 .release = ll_file_release,
3647 .mmap = ll_file_mmap,
3648 .llseek = ll_file_seek,
3649 .splice_read = ll_file_splice_read,
3652 .flock = ll_file_flock,
3653 .lock = ll_file_flock
3656 /* These are for -o noflock - to return ENOSYS on flock calls */
3657 struct file_operations ll_file_operations_noflock = {
3658 .read = ll_file_read,
3659 .aio_read = ll_file_aio_read,
3660 .write = ll_file_write,
3661 .aio_write = ll_file_aio_write,
3662 .unlocked_ioctl = ll_file_ioctl,
3663 .open = ll_file_open,
3664 .release = ll_file_release,
3665 .mmap = ll_file_mmap,
3666 .llseek = ll_file_seek,
3667 .splice_read = ll_file_splice_read,
3670 .flock = ll_file_noflock,
3671 .lock = ll_file_noflock
/* Inode operations for regular Lustre files (xattr, ACL, fiemap). */
3674 struct inode_operations ll_file_inode_operations = {
3675 .setattr = ll_setattr,
3676 .getattr = ll_getattr,
3677 .permission = ll_inode_permission,
3678 .setxattr = ll_setxattr,
3679 .getxattr = ll_getxattr,
3680 .listxattr = ll_listxattr,
3681 .removexattr = ll_removexattr,
3682 .fiemap = ll_fiemap,
3683 #ifdef HAVE_IOP_GET_ACL
3684 .get_acl = ll_get_acl,
3688 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, protected
 * by ioc_sem (readers iterate, writers register/unregister). */
3689 static struct llioc_ctl_data {
3690 struct rw_semaphore ioc_sem;
3691 struct list_head ioc_head;
3693 __RWSEM_INITIALIZER(llioc.ioc_sem),
3694 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the variable-length list of ioctl
 * command numbers it handles (flexible trailing array iocd_cmd). */
3699 struct list_head iocd_list;
3700 unsigned int iocd_size;
3701 llioc_callback_t iocd_cb;
3702 unsigned int iocd_count;
3703 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in
 * @cmd.  Returns an opaque cookie (the allocation) used later by
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure (error-return lines elided in this chunk).
 */
3706 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3709 struct llioc_data *in_data = NULL;
3712 if (cb == NULL || cmd == NULL ||
3713 count > LLIOC_MAX_CMD || count < 0)
3716 size = sizeof(*in_data) + count * sizeof(unsigned int);
3717 OBD_ALLOC(in_data, size);
3718 if (in_data == NULL)
3721 memset(in_data, 0, sizeof(*in_data));
3722 in_data->iocd_size = size;
3723 in_data->iocd_cb = cb;
3724 in_data->iocd_count = count;
3725 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3727 down_write(&llioc.ioc_sem);
3728 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3729 up_write(&llioc.ioc_sem);
/*
 * Remove and free the registration identified by the cookie @magic that
 * ll_iocontrol_register() returned.  Warns if no match is found.
 */
3734 void ll_iocontrol_unregister(void *magic)
3736 struct llioc_data *tmp;
3741 down_write(&llioc.ioc_sem);
3742 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3744 unsigned int size = tmp->iocd_size;
/* Unlink first, then drop the lock before freeing. */
3746 list_del(&tmp->iocd_list);
3747 up_write(&llioc.ioc_sem);
3749 OBD_FREE(tmp, size);
3753 up_write(&llioc.ioc_sem);
3755 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3758 EXPORT_SYMBOL(ll_iocontrol_register);
3759 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch @cmd to the registered dynamic ioctl handlers in order.
 * Stops at the first handler returning LLIOC_STOP; its rc is passed
 * back through *rcp.  Returns LLIOC_CONT if no handler claimed @cmd.
 */
3761 static enum llioc_iter
3762 ll_iocontrol_call(struct inode *inode, struct file *file,
3763 unsigned int cmd, unsigned long arg, int *rcp)
3765 enum llioc_iter ret = LLIOC_CONT;
3766 struct llioc_data *data;
3767 int rc = -EINVAL, i;
3769 down_read(&llioc.ioc_sem);
3770 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3771 for (i = 0; i < data->iocd_count; i++) {
3772 if (cmd != data->iocd_cmd[i])
3775 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3779 if (ret == LLIOC_STOP)
3782 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack for @inode.
 * For OBJECT_CONF_SET the layout lock is then allowed to match and the
 * cached layout generation is updated from the new LSM.
 */
3789 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3791 struct ll_inode_info *lli = ll_i2info(inode);
3792 struct cl_env_nest nest;
3797 if (lli->lli_clob == NULL)
3800 env = cl_env_nested_get(&nest);
3802 RETURN(PTR_ERR(env));
3804 result = cl_conf_set(env, lli->lli_clob, conf);
3805 cl_env_nested_put(&nest, env);
3807 if (conf->coc_opc == OBJECT_CONF_SET) {
3808 struct ldlm_lock *lock = conf->coc_lock;
3810 LASSERT(lock != NULL);
3811 LASSERT(ldlm_has_layout(lock));
3813 struct lustre_md *md = conf->u.coc_md;
3814 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3816 /* it can only be allowed to match after layout is
3817 * applied to inode otherwise false layout would be
3818 * seen. Applying layout should happen before dropping
3819 * the intent lock. */
3820 ldlm_lock_allow_match(lock);
3822 lli->lli_has_smd = lsm_has_objects(md->lsm);
3823 if (md->lsm != NULL)
3824 gen = md->lsm->lsm_layout_gen;
3827 DFID ": layout version change: %u -> %u\n",
3828 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3830 ll_layout_version_set(lli, gen);
3836 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Retrieve the LOV layout xattr from the MDT and install it as the
 * layout lock's LVB data.  No-op when the lock already carries a ready
 * LVB.  Returns 0 or negative errno.
 */
3837 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3840 struct ll_sb_info *sbi = ll_i2sbi(inode);
3841 struct obd_capa *oc;
3842 struct ptlrpc_request *req;
3843 struct mdt_body *body;
3850 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3851 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3852 lock->l_lvb_data, lock->l_lvb_len);
3854 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3857 /* if layout lock was granted right away, the layout is returned
3858 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3859 * blocked and then granted via completion ast, we have to fetch
3860 * layout here. Please note that we can't use the LVB buffer in
3861 * completion AST because it doesn't have a large enough buffer */
3862 oc = ll_mdscapa_get(inode);
3863 rc = ll_get_default_mdsize(sbi, &lmmsize);
3865 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3866 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3872 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3874 GOTO(out, rc = -EPROTO);
3876 lmmsize = body->mbo_eadatasize;
3877 if (lmmsize == 0) /* empty layout */
3880 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3882 GOTO(out, rc = -EFAULT);
3884 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3885 if (lvbdata == NULL)
3886 GOTO(out, rc = -ENOMEM);
/* Swap in the fresh LVB buffer under the resource lock. */
3888 memcpy(lvbdata, lmm, lmmsize);
3889 lock_res_and_lock(lock);
3890 if (lock->l_lvb_data != NULL)
3891 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3893 lock->l_lvb_data = lvbdata;
3894 lock->l_lvb_len = lmmsize;
3895 unlock_res_and_lock(lock);
3900 ptlrpc_req_finished(req);
3905 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * With the layout lock held (handle @lockh, mode @mode), fetch the
 * layout if needed, unpack it into an LSM, configure the cl_object
 * stack with it, and report the resulting layout generation in *gen.
 * If reconfiguration hits -EBUSY (layout still in use by IO), waits via
 * OBJECT_CONF_WAIT after releasing the lock.  Releases the lock ref.
 */
3908 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3909 struct inode *inode, __u32 *gen, bool reconf)
3911 struct ll_inode_info *lli = ll_i2info(inode);
3912 struct ll_sb_info *sbi = ll_i2sbi(inode);
3913 struct ldlm_lock *lock;
3914 struct lustre_md md = { NULL };
3915 struct cl_object_conf conf;
3918 bool wait_layout = false;
3921 LASSERT(lustre_handle_is_used(lockh));
3923 lock = ldlm_handle2lock(lockh);
3924 LASSERT(lock != NULL);
3925 LASSERT(ldlm_has_layout(lock));
3927 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3928 PFID(&lli->lli_fid), inode, reconf);
3930 /* in case this is a caching lock and reinstate with new inode */
3931 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3933 lock_res_and_lock(lock);
3934 lvb_ready = ldlm_is_lvb_ready(lock);
3935 unlock_res_and_lock(lock);
3936 /* checking lvb_ready is racy but this is okay. The worst case is
3937 * that multi processes may configure the file on the same time. */
3939 if (lvb_ready || !reconf) {
3942 /* layout_gen must be valid if layout lock is not
3943 * cancelled and stripe has already set */
3944 *gen = ll_layout_version_get(lli);
3950 rc = ll_layout_fetch(inode, lock);
3954 /* for layout lock, lmm is returned in lock's lvb.
3955 * lvb_data is immutable if the lock is held so it's safe to access it
3956 * without res lock. See the description in ldlm_lock_decref_internal()
3957 * for the condition to free lvb_data of layout lock */
3958 if (lock->l_lvb_data != NULL) {
3959 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3960 lock->l_lvb_data, lock->l_lvb_len);
3962 *gen = LL_LAYOUT_GEN_EMPTY;
3964 *gen = md.lsm->lsm_layout_gen;
3967 CERROR("%s: file "DFID" unpackmd error: %d\n",
3968 ll_get_fsname(inode->i_sb, NULL, 0),
3969 PFID(&lli->lli_fid), rc);
3975 /* set layout to file. Unlikely this will fail as old layout was
3976 * surely eliminated */
3977 memset(&conf, 0, sizeof conf);
3978 conf.coc_opc = OBJECT_CONF_SET;
3979 conf.coc_inode = inode;
3980 conf.coc_lock = lock;
3981 conf.u.coc_md = &md;
3982 rc = ll_layout_conf(inode, &conf);
3985 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3987 /* refresh layout failed, need to wait */
3988 wait_layout = rc == -EBUSY;
3992 LDLM_LOCK_PUT(lock);
3993 ldlm_lock_decref(lockh, mode);
3995 /* wait for IO to complete if it's still being used. */
3997 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3998 ll_get_fsname(inode->i_sb, NULL, 0),
3999 PFID(&lli->lli_fid), inode);
4001 memset(&conf, 0, sizeof conf);
4002 conf.coc_opc = OBJECT_CONF_WAIT;
4003 conf.coc_inode = inode;
4004 rc = ll_layout_conf(inode, &conf);
4008 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4009 ll_get_fsname(inode->i_sb, NULL, 0),
4010 PFID(&lli->lli_fid), rc);
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * afterwards.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 */
4028 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4030 struct ll_inode_info *lli = ll_i2info(inode);
4031 struct ll_sb_info *sbi = ll_i2sbi(inode);
4032 struct md_op_data *op_data;
4033 struct lookup_intent it;
4034 struct lustre_handle lockh;
4036 struct ldlm_enqueue_info einfo = {
4037 .ei_type = LDLM_IBITS,
4039 .ei_cb_bl = &ll_md_blocking_ast,
4040 .ei_cb_cp = &ldlm_completion_ast,
4045 *gen = ll_layout_version_get(lli);
4046 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4050 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4051 LASSERT(S_ISREG(inode->i_mode));
4053 /* take layout lock mutex to enqueue layout lock exclusively. */
4054 mutex_lock(&lli->lli_layout_mutex);
4057 /* mostly layout lock is caching on the local side, so try to match
4058 * it before grabbing layout lock mutex. */
4059 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4060 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4061 if (mode != 0) { /* hit cached lock */
4062 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4066 mutex_unlock(&lli->lli_layout_mutex);
4070 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4071 0, 0, LUSTRE_OPC_ANY, NULL);
4072 if (IS_ERR(op_data)) {
4073 mutex_unlock(&lli->lli_layout_mutex);
4074 RETURN(PTR_ERR(op_data));
4077 /* have to enqueue one */
4078 memset(&it, 0, sizeof(it));
4079 it.it_op = IT_LAYOUT;
4080 lockh.cookie = 0ULL;
4082 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4083 ll_get_fsname(inode->i_sb, NULL, 0),
4084 PFID(&lli->lli_fid), inode);
4086 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
4087 if (it.d.lustre.it_data != NULL)
4088 ptlrpc_req_finished(it.d.lustre.it_data);
4089 it.d.lustre.it_data = NULL;
4091 ll_finish_md_op_data(op_data);
4093 mode = it.d.lustre.it_lock_mode;
4094 it.d.lustre.it_lock_mode = 0;
4095 ll_intent_drop_lock(&it);
4098 /* set lock data in case this is a new lock */
4099 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4100 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4104 mutex_unlock(&lli->lli_layout_mutex);
 * This function sends a restore request to the MDT.
 */
4112 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4114 struct hsm_user_request *hur;
4118 len = sizeof(struct hsm_user_request) +
4119 sizeof(struct hsm_user_item);
4120 OBD_ALLOC(hur, len);
4124 hur->hur_request.hr_action = HUA_RESTORE;
4125 hur->hur_request.hr_archive_id = 0;
4126 hur->hur_request.hr_flags = 0;
4127 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4128 sizeof(hur->hur_user_item[0].hui_fid));
4129 hur->hur_user_item[0].hui_extent.offset = offset;
4130 hur->hur_user_item[0].hui_extent.length = length;
4131 hur->hur_request.hr_itemcount = 1;
4132 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,