4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <linux/pagemap.h>
46 #include <linux/file.h>
47 #include <linux/sched.h>
48 #include "llite_internal.h"
49 #include <lustre/ll_fiemap.h>
50 #include <lustre_ioctl.h>
52 #include "cl_object.h"
55 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
57 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
60 static enum llioc_iter
61 ll_iocontrol_call(struct inode *inode, struct file *file,
62 unsigned int cmd, unsigned long arg, int *rcp);
/* Allocate a per-open-file ll_file_data from its dedicated slab cache.
 * GFP_NOFS prevents filesystem recursion during memory reclaim.
 * NOTE(review): the allocation-failure check and the return statement are
 * elided from this view of the file. */
64 static struct ll_file_data *ll_file_data_get(void)
66 struct ll_file_data *fd;
68 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* Start with a clean write-failure status for this open. */
72 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache; counterpart of
 * ll_file_data_get(). */
77 static void ll_file_data_put(struct ll_file_data *fd)
80 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes (fid, mode, a/m/ctime, size, blocks,
 * flags), its IO epoch, the open handle @fh and the MDS capability into
 * @op_data so they can be shipped to the MDS (e.g. on close). */
83 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
84 struct lustre_handle *fh)
86 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
87 op_data->op_attr.ia_mode = inode->i_mode;
88 op_data->op_attr.ia_atime = inode->i_atime;
89 op_data->op_attr.ia_mtime = inode->i_mtime;
90 op_data->op_attr.ia_ctime = inode->i_ctime;
91 op_data->op_attr.ia_size = i_size_read(inode);
92 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
93 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
94 ll_inode_to_ext_flags(inode->i_flags);
95 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
97 op_data->op_handle = *fh;
98 op_data->op_capa1 = ll_mdscapa_get(inode);
/* If local data was modified while open, ask the MDS to note it. */
100 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
101 op_data->op_bias |= MDS_DATA_MODIFIED;
105 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE RPC.  (Surrounding comment partially elided in this view.)
108 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
109 struct obd_client_handle *och)
/* Always send mode and timestamps back to the MDS on close. */
113 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
114 ATTR_MTIME | ATTR_MTIME_SET |
115 ATTR_CTIME | ATTR_CTIME_SET;
/* NOTE(review): the body of this read-only-open branch is elided here. */
117 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client is
 * authoritative for size/blocks, so include them in the close. */
120 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
121 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
123 ll_ioepoch_close(inode, op_data, &och, 0);
126 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
127 ll_prep_md_op_data(op_data, inode, NULL, NULL,
128 0, 0, LUSTRE_OPC_ANY, NULL);
/* Send the CLOSE RPC to the MDS for open handle @och on @inode.  When
 * @data_version is non-NULL this close also performs an HSM release.
 * NOTE(review): several error/early-exit paths are elided in this view of
 * the file; comments below only describe what is visible. */
132 static int ll_close_inode_openhandle(struct obd_export *md_exp,
134 struct obd_client_handle *och,
135 const __u64 *data_version)
137 struct obd_export *exp = ll_i2mdexp(inode);
138 struct md_op_data *op_data;
139 struct ptlrpc_request *req = NULL;
140 struct obd_device *obd = class_exp2obd(exp);
147 * XXX: in case of LMV, is this correct to access
150 CERROR("Invalid MDC connection handle "LPX64"\n",
151 ll_i2mdexp(inode)->exp_handle.h_cookie);
155 OBD_ALLOC_PTR(op_data);
/* Known deficiency acknowledged by the original author: on allocation
 * failure the openhandle and request are leaked. */
157 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
159 ll_prepare_close(inode, op_data, och);
160 if (data_version != NULL) {
161 /* Pass in data_version implies release. */
162 op_data->op_bias |= MDS_HSM_RELEASE;
163 op_data->op_data_version = *data_version;
164 op_data->op_lease_handle = och->och_lease_handle;
165 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
167 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
168 rc = md_close(md_exp, op_data, och->och_mod, &req);
170 /* This close must have the epoch closed. */
171 LASSERT(epoch_close);
172 /* MDS has instructed us to obtain Size-on-MDS attribute from
173 * OSTs and send setattr to back to MDS. */
174 rc = ll_som_update(inode, op_data);
176 CERROR("%s: inode "DFID" mdc Size-on-MDS update"
177 " failed: rc = %d\n",
178 ll_i2mdexp(inode)->exp_obd->obd_name,
179 PFID(ll_inode2fid(inode)), rc);
183 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
184 ll_i2mdexp(inode)->exp_obd->obd_name,
185 PFID(ll_inode2fid(inode)), rc);
188 /* DATA_MODIFIED flag was successfully sent on close, cancel data
189 * modification flag. */
190 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
191 struct ll_inode_info *lli = ll_i2info(inode);
193 spin_lock(&lli->lli_lock);
194 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
195 spin_unlock(&lli->lli_lock);
199 rc = ll_objects_destroy(req, inode);
201 CERROR("%s: inode "DFID
202 " ll_objects destroy: rc = %d\n",
203 ll_i2mdexp(inode)->exp_obd->obd_name,
204 PFID(ll_inode2fid(inode)), rc);
/* For a successful HSM release, check the server confirmed it. */
207 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
208 struct mdt_body *body;
209 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
210 if (!(body->mbo_valid & OBD_MD_FLRELEASED))
214 ll_finish_md_op_data(op_data);
/* If SOM is in use and the epoch was left open on a written regular
 * file, queue the DONE_WRITING work. */
218 if (exp_connect_som(exp) && !epoch_close &&
219 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
220 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
222 md_clear_open_replay_data(md_exp, och);
223 /* Free @och if it is not waiting for DONE_WRITING. */
224 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
227 if (req) /* This is close request */
228 ptlrpc_req_finished(req);
/* Really close the MDS open handle of @inode that matches open mode
 * @fmode (write/exec/read), unless other users still hold it.
 * NOTE(review): the final argument list of the close call and the return
 * path are elided in this view. */
232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
234 struct ll_inode_info *lli = ll_i2info(inode);
235 struct obd_client_handle **och_p;
236 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its use count. */
241 if (fmode & FMODE_WRITE) {
242 och_p = &lli->lli_mds_write_och;
243 och_usecount = &lli->lli_open_fd_write_count;
244 } else if (fmode & FMODE_EXEC) {
245 och_p = &lli->lli_mds_exec_och;
246 och_usecount = &lli->lli_open_fd_exec_count;
248 LASSERT(fmode & FMODE_READ);
249 och_p = &lli->lli_mds_read_och;
250 och_usecount = &lli->lli_open_fd_read_count;
253 mutex_lock(&lli->lli_och_mutex);
254 if (*och_usecount > 0) {
255 /* There are still users of this handle, so skip
257 mutex_unlock(&lli->lli_och_mutex);
263 mutex_unlock(&lli->lli_och_mutex);
266 /* There might be a race and this handle may already
268 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: drop group lock and lease if held, release any
 * private open handle, decrement the per-mode open counts, and call
 * ll_md_real_close() unless a matching OPEN DLM lock lets us skip the
 * MDS round-trip.  Frees the file's ll_file_data at the end. */
275 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
278 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
279 struct ll_inode_info *lli = ll_i2info(inode);
283 /* clear group lock, if present */
284 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
285 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
287 if (fd->fd_lease_och != NULL) {
290 /* Usually the lease is not released when the
291 * application crashed, we need to release here. */
292 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
293 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
294 PFID(&lli->lli_fid), rc, lease_broken);
296 fd->fd_lease_och = NULL;
299 if (fd->fd_och != NULL) {
300 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
305 /* Let's see if we have good enough OPEN lock on the file and if
306 we can skip talking to MDS */
307 if (file->f_dentry->d_inode) { /* Can this ever be false? */
309 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
310 struct lustre_handle lockh;
311 struct inode *inode = file->f_dentry->d_inode;
312 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
314 mutex_lock(&lli->lli_och_mutex);
315 if (fd->fd_omode & FMODE_WRITE) {
317 LASSERT(lli->lli_open_fd_write_count);
318 lli->lli_open_fd_write_count--;
319 } else if (fd->fd_omode & FMODE_EXEC) {
321 LASSERT(lli->lli_open_fd_exec_count);
322 lli->lli_open_fd_exec_count--;
325 LASSERT(lli->lli_open_fd_read_count);
326 lli->lli_open_fd_read_count--;
328 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached locally: tell the MDS to close. */
330 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
331 LDLM_IBITS, &policy, lockmode,
333 rc = ll_md_real_close(file->f_dentry->d_inode,
337 CERROR("released file has negative dentry: file = %p, "
338 "dentry = %p, name = %s\n",
339 file, file->f_dentry, file->f_dentry->d_name.name);
343 LUSTRE_FPRIVATE(file) = NULL;
344 ll_file_data_put(fd);
345 ll_capa_close(inode);
350 /* While this returns an error code, fput() the caller does not, so we need
351 * to make every effort to clean up all of our state here. Also, applications
352 * rarely check close errors and even if an error is returned they will not
353 * re-try the close call.
355 int ll_file_release(struct inode *inode, struct file *file)
357 struct ll_file_data *fd;
358 struct ll_sb_info *sbi = ll_i2sbi(inode);
359 struct ll_inode_info *lli = ll_i2info(inode);
363 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
364 PFID(ll_inode2fid(inode)), inode);
366 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only to the root inode. */
367 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
368 inode == inode->i_sb->s_root->d_inode) {
369 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
372 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
373 fd->fd_flags &= ~LL_FILE_RMTACL;
374 rct_del(&sbi->ll_rct, current_pid());
375 et_search_free(&sbi->ll_et, current_pid());
/* Do not count releases of the root dentry in the stats. */
380 if (inode->i_sb->s_root != file->f_dentry)
381 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
382 fd = LUSTRE_FPRIVATE(file);
385 /* The last ref on @file, maybe not the the owner pid of statahead,
386 * because parent and child process can share the same file handle. */
387 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
388 ll_deauthorize_statahead(inode, fd);
/* The root dentry has no MDS open handle: just free fd and return. */
390 if (inode->i_sb->s_root == file->f_dentry) {
391 LUSTRE_FPRIVATE(file) = NULL;
392 ll_file_data_put(fd);
/* For regular files, fold any async write errors into lli_async_rc
 * so they can be reported from this close. */
396 if (!S_ISDIR(inode->i_mode)) {
397 if (lli->lli_clob != NULL)
398 lov_read_and_clear_async_rc(lli->lli_clob);
399 lli->lli_async_rc = 0;
402 rc = ll_md_close(sbi->ll_md_exp, inode, file);
404 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
405 libcfs_debug_dumplog();
/* Send an intent-OPEN enqueue to the MDS for @file, optionally packing
 * striping data @lmm/@lmmsize, then install the returned inode/lock data.
 * NOTE(review): several error-path lines are elided in this view. */
410 static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize,
411 struct lookup_intent *itp)
413 struct dentry *de = file->f_dentry;
414 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
415 struct dentry *parent = de->d_parent;
416 const char *name = NULL;
418 struct md_op_data *op_data;
419 struct ptlrpc_request *req = NULL;
423 LASSERT(parent != NULL);
424 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
426 /* if server supports open-by-fid, or file name is invalid, don't pack
427 * name in open request */
428 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
429 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
430 name = de->d_name.name;
431 len = de->d_name.len;
434 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
435 name, len, 0, LUSTRE_OPC_ANY, NULL);
437 RETURN(PTR_ERR(op_data));
438 op_data->op_data = lmm;
439 op_data->op_data_size = lmmsize;
441 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
442 &ll_md_blocking_ast, 0);
443 ll_finish_md_op_data(op_data);
445 /* reason for keep own exit path - don`t flood log
446 * with messages with -ESTALE errors.
/* If the MDS returned an open handle we cannot use, release it. */
448 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
449 it_open_error(DISP_OPEN_OPEN, itp))
451 ll_release_openhandle(de, itp);
455 if (it_disposition(itp, DISP_LOOKUP_NEG))
456 GOTO(out, rc = -ENOENT);
458 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
459 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
460 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Update the inode from the reply and attach the granted lock. */
464 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
465 if (!rc && itp->d.lustre.it_lock_mode)
466 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
469 ptlrpc_req_finished(req);
470 ll_intent_drop_lock(itp);
476 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
477 * not believe attributes if a few ioepoch holders exist. Attributes for
478 * previous ioepoch if new one is opened are also skipped by MDS.
480 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a non-zero epoch that differs from the current one. */
482 if (ioepoch && lli->lli_ioepoch != ioepoch) {
483 lli->lli_ioepoch = ioepoch;
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 ioepoch, PFID(&lli->lli_fid));
/* Fill @och from the MDT reply body attached to intent @it (open handle,
 * fid, lease lock handle, flags), then register the handle for open
 * replay on recovery. */
489 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
490 struct obd_client_handle *och)
492 struct ptlrpc_request *req = it->d.lustre.it_data;
493 struct mdt_body *body;
495 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
496 och->och_fh = body->mbo_handle;
497 och->och_fid = body->mbo_fid1;
498 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
499 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
500 och->och_flags = it->it_flags;
502 return md_set_open_replay_data(md_exp, och, it);
/* Finish a local open: optionally fill @och from the intent reply, record
 * the new IO epoch, then attach @fd as the file's private data and
 * initialize readahead and cl_io context state. */
505 static int ll_local_open(struct file *file, struct lookup_intent *it,
506 struct ll_file_data *fd, struct obd_client_handle *och)
508 struct inode *inode = file->f_dentry->d_inode;
509 struct ll_inode_info *lli = ll_i2info(inode);
/* fd must not already be installed on this file. */
512 LASSERT(!LUSTRE_FPRIVATE(file));
517 struct ptlrpc_request *req = it->d.lustre.it_data;
518 struct mdt_body *body;
521 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
525 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
526 ll_ioepoch_open(lli, body->mbo_ioepoch);
529 LUSTRE_FPRIVATE(file) = fd;
530 ll_readahead_init(inode, &fd->fd_ras);
531 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
533 /* ll_cl_context initialize */
534 rwlock_init(&fd->fd_lock);
535 INIT_LIST_HEAD(&fd->fd_lccs);
540 /* Open a file, and (for the very first open) create objects on the OSTs at
541 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
542 * creation or open until ll_lov_setstripe() ioctl is called.
544 * If we already have the stripe MD locally then we don't request it in
545 * md_open(), by passing a lmm_size = 0.
547 * It is up to the application to ensure no other processes open this file
548 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
549 * used. We might be able to avoid races of that sort by getting lli_open_sem
550 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
551 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
553 int ll_file_open(struct inode *inode, struct file *file)
555 struct ll_inode_info *lli = ll_i2info(inode);
556 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
557 .it_flags = file->f_flags };
558 struct obd_client_handle **och_p = NULL;
559 __u64 *och_usecount = NULL;
560 struct ll_file_data *fd;
564 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
565 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed in private_data by the lookup path. */
567 it = file->private_data; /* XXX: compat macro */
568 file->private_data = NULL; /* prevent ll_local_open assertion */
570 fd = ll_file_data_get();
572 GOTO(out_openerr, rc = -ENOMEM);
575 if (S_ISDIR(inode->i_mode))
576 ll_authorize_statahead(inode, fd);
/* Opening the filesystem root needs no MDS open handle. */
578 if (inode->i_sb->s_root == file->f_dentry) {
579 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build one (oit) from f_flags. */
583 if (!it || !it->d.lustre.it_disposition) {
584 /* Convert f_flags into access mode. We cannot use file->f_mode,
585 * because everything but O_ACCMODE mask was stripped from
587 if ((oit.it_flags + 1) & O_ACCMODE)
589 if (file->f_flags & O_TRUNC)
590 oit.it_flags |= FMODE_WRITE;
592 /* kernel only call f_op->open in dentry_open. filp_open calls
593 * dentry_open after call to open_namei that checks permissions.
594 * Only nfsd_open call dentry_open directly without checking
595 * permissions and because of that this code below is safe. */
596 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
597 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
599 /* We do not want O_EXCL here, presumably we opened the file
600 * already? XXX - NFS implications? */
601 oit.it_flags &= ~O_EXCL;
603 /* bug20584, if "it_flags" contains O_CREAT, the file will be
604 * created if necessary, then "IT_CREAT" should be set to keep
605 * consistent with it */
606 if (oit.it_flags & O_CREAT)
607 oit.it_op |= IT_CREAT;
613 /* Let's see if we have file open on MDS already. */
614 if (it->it_flags & FMODE_WRITE) {
615 och_p = &lli->lli_mds_write_och;
616 och_usecount = &lli->lli_open_fd_write_count;
617 } else if (it->it_flags & FMODE_EXEC) {
618 och_p = &lli->lli_mds_exec_och;
619 och_usecount = &lli->lli_open_fd_exec_count;
621 och_p = &lli->lli_mds_read_och;
622 och_usecount = &lli->lli_open_fd_read_count;
625 mutex_lock(&lli->lli_och_mutex);
626 if (*och_p) { /* Open handle is present */
627 if (it_disposition(it, DISP_OPEN_OPEN)) {
628 /* Well, there's extra open request that we do not need,
629 let's close it somehow. This will decref request. */
630 rc = it_open_error(DISP_OPEN_OPEN, it);
632 mutex_unlock(&lli->lli_och_mutex);
633 GOTO(out_openerr, rc);
636 ll_release_openhandle(file->f_dentry, it);
/* Reuse the existing MDS handle for this local open. */
640 rc = ll_local_open(file, it, fd, NULL);
643 mutex_unlock(&lli->lli_och_mutex);
644 GOTO(out_openerr, rc);
647 LASSERT(*och_usecount == 0);
648 if (!it->d.lustre.it_disposition) {
649 /* We cannot just request lock handle now, new ELC code
650 means that one of other OPEN locks for this file
651 could be cancelled, and since blocking ast handler
652 would attempt to grab och_mutex as well, that would
653 result in a deadlock */
654 mutex_unlock(&lli->lli_och_mutex);
656 * Normally called under two situations:
658 * 2. A race/condition on MDS resulting in no open
659 * handle to be returned from LOOKUP|OPEN request,
660 * for example if the target entry was a symlink.
662 * Always fetch MDS_OPEN_LOCK if this is not setstripe.
664 * Always specify MDS_OPEN_BY_FID because we don't want
665 * to get file with different fid.
667 it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID;
668 rc = ll_intent_file_open(file, NULL, 0, it);
670 GOTO(out_openerr, rc);
674 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
676 GOTO(out_och_free, rc = -ENOMEM);
680 /* md_intent_lock() didn't get a request ref if there was an
681 * open error, so don't do cleanup on the request here
683 /* XXX (green): Should not we bail out on any error here, not
684 * just open error? */
685 rc = it_open_error(DISP_OPEN_OPEN, it);
687 GOTO(out_och_free, rc);
689 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
690 "inode %p: disposition %x, status %d\n", inode,
691 it_disposition(it, ~0), it->d.lustre.it_status);
693 rc = ll_local_open(file, it, fd, *och_p);
695 GOTO(out_och_free, rc);
697 mutex_unlock(&lli->lli_och_mutex);
700 /* Must do this outside lli_och_mutex lock to prevent deadlock where
701 different kind of OPEN lock for this same inode gets cancelled
702 by ldlm_cancel_lru */
703 if (!S_ISREG(inode->i_mode))
704 GOTO(out_och_free, rc);
/* Delay OST object creation when permitted (O_LOV_DELAY_CREATE or a
 * read-only open of a file with no striping yet). */
708 if (!lli->lli_has_smd &&
709 (cl_is_lov_delay_create(file->f_flags) ||
710 (file->f_mode & FMODE_WRITE) == 0)) {
711 CDEBUG(D_INODE, "object creation was delayed\n");
712 GOTO(out_och_free, rc);
714 cl_lov_delay_create_clear(&file->f_flags);
715 GOTO(out_och_free, rc);
/* Error unwind: free the handle slot, drop stat-ahead authorization and
 * the fd, and drop any intent open reference still held. */
719 if (och_p && *och_p) {
720 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
721 *och_p = NULL; /* OBD_FREE writes some magic there */
724 mutex_unlock(&lli->lli_och_mutex);
727 if (lli->lli_opendir_key == fd)
728 ll_deauthorize_statahead(inode, fd);
730 ll_file_data_put(fd);
732 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
735 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
736 ptlrpc_req_finished(it->d.lustre.it_data);
737 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, asynchronously
 * cancel the lease lock.  Unlike ll_md_blocking_ast() this does not touch
 * the open handle (see comments at the ll_lease_open() call site). */
743 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
744 struct ldlm_lock_desc *desc, void *data, int flag)
747 struct lustre_handle lockh;
751 case LDLM_CB_BLOCKING:
752 ldlm_lock2handle(lock, &lockh);
753 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
755 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
759 case LDLM_CB_CANCELING:
767 * Acquire a lease and open the file.
/* Returns the new obd_client_handle on success or an ERR_PTR on failure.
 * When @file is non-NULL the lease must match the file's open mode and is
 * tied to its existing MDS open handle (old_handle). */
769 static struct obd_client_handle *
770 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
773 struct lookup_intent it = { .it_op = IT_OPEN };
774 struct ll_sb_info *sbi = ll_i2sbi(inode);
775 struct md_op_data *op_data;
776 struct ptlrpc_request *req = NULL;
777 struct lustre_handle old_handle = { 0 };
778 struct obd_client_handle *och = NULL;
/* A lease is either read or write, never exec or combined modes. */
783 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
784 RETURN(ERR_PTR(-EINVAL));
787 struct ll_inode_info *lli = ll_i2info(inode);
788 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
789 struct obd_client_handle **och_p;
792 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
793 RETURN(ERR_PTR(-EPERM));
795 /* Get the openhandle of the file */
797 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per struct file. */
798 if (fd->fd_lease_och != NULL) {
799 mutex_unlock(&lli->lli_och_mutex);
803 if (fd->fd_och == NULL) {
804 if (file->f_mode & FMODE_WRITE) {
805 LASSERT(lli->lli_mds_write_och != NULL);
806 och_p = &lli->lli_mds_write_och;
807 och_usecount = &lli->lli_open_fd_write_count;
809 LASSERT(lli->lli_mds_read_och != NULL);
810 och_p = &lli->lli_mds_read_och;
811 och_usecount = &lli->lli_open_fd_read_count;
/* The shared handle can be privatized only if we are its sole user.
 * NOTE(review): the transfer itself is elided in this view. */
813 if (*och_usecount == 1) {
820 mutex_unlock(&lli->lli_och_mutex);
821 if (rc < 0) /* more than 1 opener */
824 LASSERT(fd->fd_och != NULL);
825 old_handle = fd->fd_och->och_fh;
830 RETURN(ERR_PTR(-ENOMEM));
832 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
833 LUSTRE_OPC_ANY, NULL);
835 GOTO(out, rc = PTR_ERR(op_data));
837 /* To tell the MDT this openhandle is from the same owner */
838 op_data->op_handle = old_handle;
840 it.it_flags = fmode | open_flags;
841 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
842 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
843 &ll_md_blocking_lease_ast,
844 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
845 * it can be cancelled which may mislead applications that the lease is
847 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
848 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
849 * doesn't deal with openhandle, so normal openhandle will be leaked. */
850 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
851 ll_finish_md_op_data(op_data);
852 ptlrpc_req_finished(req);
854 GOTO(out_release_it, rc);
856 if (it_disposition(&it, DISP_LOOKUP_NEG))
857 GOTO(out_release_it, rc = -ENOENT);
859 rc = it_open_error(DISP_OPEN_OPEN, &it);
861 GOTO(out_release_it, rc);
863 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
864 ll_och_fill(sbi->ll_md_exp, &it, och);
866 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
867 GOTO(out_close, rc = -EOPNOTSUPP);
869 /* already get lease, handle lease lock */
870 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
871 if (it.d.lustre.it_lock_mode == 0 ||
872 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
873 /* open lock must return for lease */
874 CERROR(DFID "lease granted but no open lock, %d/"LPU64".\n",
875 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
876 it.d.lustre.it_lock_bits);
877 GOTO(out_close, rc = -EPROTO);
880 ll_intent_release(&it);
/* Error unwind: cancel the open lock (if any), close the handle and
 * release the intent before returning ERR_PTR(rc). */
884 /* Cancel open lock */
885 if (it.d.lustre.it_lock_mode != 0) {
886 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
887 it.d.lustre.it_lock_mode);
888 it.d.lustre.it_lock_mode = 0;
889 och->och_lease_handle.cookie = 0ULL;
891 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
893 CERROR("%s: error closing file "DFID": %d\n",
894 ll_get_fsname(inode->i_sb, NULL, 0),
895 PFID(&ll_i2info(inode)->lli_fid), rc2);
896 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
898 ll_intent_release(&it);
906 * Release lease and close the file.
907 * It will check if the lease has ever broken.
909 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
912 struct ldlm_lock *lock;
913 bool cancelled = true;
/* Inspect the lease lock (if it still exists) to learn whether the
 * lease was already broken (cancelled) before this close. */
917 lock = ldlm_handle2lock(&och->och_lease_handle);
919 lock_res_and_lock(lock);
920 cancelled = ldlm_is_cancel(lock);
921 unlock_res_and_lock(lock);
925 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
926 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Cancel the lease lock ourselves if it was not already broken.
 * NOTE(review): the guarding condition is elided in this view. */
929 ldlm_cli_cancel(&och->och_lease_handle, 0);
930 if (lease_broken != NULL)
931 *lease_broken = cancelled;
933 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
938 /* Fills the obdo with the attributes for the lsm */
/* Issues an async getattr to the OSTs for all stripes of @lsm and waits
 * for completion.  @dv_flags may request server-side read/write flushes
 * for data-version purposes. */
939 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
940 struct obd_capa *capa, struct obdo *obdo,
941 __u64 ioepoch, int dv_flags)
943 struct ptlrpc_request_set *set;
944 struct obd_info oinfo = { { { 0 } } };
949 LASSERT(lsm != NULL);
953 oinfo.oi_oa->o_oi = lsm->lsm_oi;
954 oinfo.oi_oa->o_mode = S_IFREG;
955 oinfo.oi_oa->o_ioepoch = ioepoch;
956 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
957 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
958 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
959 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
960 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
961 OBD_MD_FLDATAVERSION;
962 oinfo.oi_capa = capa;
/* Flush requests take a server-side lock on the OSTs. */
963 if (dv_flags & (LL_DV_WR_FLUSH | LL_DV_RD_FLUSH)) {
964 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
965 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
966 if (dv_flags & LL_DV_WR_FLUSH)
967 oinfo.oi_oa->o_flags |= OBD_FL_FLUSH;
970 set = ptlrpc_prep_set();
972 CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM);
975 rc = obd_getattr_async(exp, &oinfo, set);
977 rc = ptlrpc_set_wait(set);
978 ptlrpc_set_destroy(set);
/* Only keep the validity bits this caller is interested in. */
981 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
982 OBD_MD_FLATIME | OBD_MD_FLMTIME |
983 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
984 OBD_MD_FLDATAVERSION | OBD_MD_FLFLAGS);
/* A write-flush that the server did not confirm is an error.
 * NOTE(review): the resulting action is elided in this view. */
985 if (dv_flags & LL_DV_WR_FLUSH &&
986 !(oinfo.oi_oa->o_valid & OBD_MD_FLFLAGS &&
987 oinfo.oi_oa->o_flags & OBD_FL_FLUSH))
994 * Performs the getattr on the inode and updates its fields.
995 * If @sync != 0, perform the getattr under the server-side lock.
997 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
998 __u64 ioepoch, int sync)
1000 struct obd_capa *capa = ll_mdscapa_get(inode);
1001 struct lov_stripe_md *lsm;
1005 lsm = ccc_inode_lsm_get(inode);
1006 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
1007 capa, obdo, ioepoch, sync ? LL_DV_RD_FLUSH : 0);
1010 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Merge the OST attributes into the VFS inode. */
1012 obdo_refresh_inode(inode, obdo, obdo->o_valid);
1013 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
1014 " blksize %lu\n", POSTID(oi), i_size_read(inode),
1015 (unsigned long long)inode->i_blocks,
1016 1UL << inode->i_blkbits);
1018 ccc_inode_lsm_put(inode, lsm);
/* Merge the MDS-provided timestamps cached in lli_lvb with the attributes
 * obtained from the OSTs (via the cl_object), keeping the most recent of
 * each, and update the inode's size/blocks under the inode size lock. */
1022 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1024 struct ll_inode_info *lli = ll_i2info(inode);
1025 struct cl_object *obj = lli->lli_clob;
1026 struct cl_attr *attr = ccc_env_thread_attr(env);
1032 ll_inode_size_lock(inode);
1033 /* merge timestamps the most recently obtained from mds with
1034 timestamps obtained from osts */
1035 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1036 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1037 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1039 lvb.lvb_size = i_size_read(inode);
1040 lvb.lvb_blocks = inode->i_blocks;
1041 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1042 lvb.lvb_atime = LTIME_S(inode->i_atime);
1043 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1045 cl_object_attr_lock(obj);
1046 rc = cl_object_attr_get(env, obj, attr);
1047 cl_object_attr_unlock(obj);
/* Keep the newest timestamp from either source. */
1050 if (lvb.lvb_atime < attr->cat_atime)
1051 lvb.lvb_atime = attr->cat_atime;
1052 if (lvb.lvb_ctime < attr->cat_ctime)
1053 lvb.lvb_ctime = attr->cat_ctime;
1054 if (lvb.lvb_mtime < attr->cat_mtime)
1055 lvb.lvb_mtime = attr->cat_mtime;
1057 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1058 PFID(&lli->lli_fid), attr->cat_size);
1059 cl_isize_write_nolock(inode, attr->cat_size);
1061 inode->i_blocks = attr->cat_blocks;
1063 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1064 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1065 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1067 ll_inode_size_unlock(inode);
/* Glimpse helper for ioctl paths: fetch OST attributes for @lsm and copy
 * the size/blocks/timestamps into the user-visible stat structure @st. */
1072 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1075 struct obdo obdo = { 0 };
1078 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1080 st->st_size = obdo.o_size;
1081 st->st_blocks = obdo.o_blocks;
1082 st->st_mtime = obdo.o_mtime;
1083 st->st_atime = obdo.o_atime;
1084 st->st_ctime = obdo.o_ctime;
/* Decide whether atime updates should be suppressed for @file, checking
 * the open flags, inode flags, mount flags and superblock flags in turn.
 * Mirrors the kernel's file_accessed()/touch_atime() logic. */
1089 static bool file_is_noatime(const struct file *file)
1091 const struct vfsmount *mnt = file->f_path.mnt;
1092 const struct inode *inode = file->f_path.dentry->d_inode;
1094 /* Adapted from file_accessed() and touch_atime().*/
1095 if (file->f_flags & O_NOATIME)
1098 if (inode->i_flags & S_NOATIME)
1101 if (IS_NOATIME(inode))
1104 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1107 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1110 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/* Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: non-blocking/append/sync flags, the target cl_object, the lock
 * requirement policy, and atime suppression. */
1116 static void ll_io_init(struct cl_io *io, const struct file *file, int write)
1118 struct inode *inode = file->f_dentry->d_inode;
1120 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1122 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1123 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1124 file->f_flags & O_DIRECT ||
1127 io->ci_obj = ll_i2info(inode)->lli_clob;
1128 io->ci_lockreq = CILR_MAYBE;
/* nolock mounts skip DLM locking entirely; O_APPEND must lock. */
1129 if (ll_file_nolock(file)) {
1130 io->ci_lockreq = CILR_NEVER;
1131 io->ci_no_srvlock = 1;
1132 } else if (file->f_flags & O_APPEND) {
1133 io->ci_lockreq = CILR_MANDATORY;
1136 io->ci_noatime = file_is_noatime(file);
/* Common driver for file reads and writes: set up the cl_io, take the
 * per-file range lock for non-group-locked writes, run the cl_io loop,
 * and account the result.  On a zero-byte restartable result the IO is
 * retried (loop head elided in this view). */
1140 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1141 struct file *file, enum cl_io_type iot,
1142 loff_t *ppos, size_t count)
1144 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1145 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 struct range_lock range;
1151 CDEBUG(D_VFSTRACE, "file: %s, type: %d ppos: "LPU64", count: %zd\n",
1152 file->f_dentry->d_name.name, iot, *ppos, count);
1155 io = ccc_env_thread_io(env);
1156 ll_io_init(io, file, iot == CIT_WRITE);
1158 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1159 struct vvp_io *vio = vvp_env_io(env);
1160 struct ccc_io *cio = ccc_env_io(env);
1161 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final offset is unknown. */
1163 if (file->f_flags & O_APPEND)
1164 range_lock_init(&range, 0, LUSTRE_EOF);
1166 range_lock_init(&range, *ppos, *ppos + count - 1);
1167 cio->cui_fd = LUSTRE_FPRIVATE(file);
1168 vio->cui_io_subtype = args->via_io_subtype;
1170 switch (vio->cui_io_subtype) {
1172 cio->cui_iov = args->u.normal.via_iov;
1173 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1174 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1175 cio->cui_iocb = args->u.normal.via_iocb;
/* Group-locked writes already serialize; otherwise take the
 * per-inode write range lock. */
1176 if ((iot == CIT_WRITE) &&
1177 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1178 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1180 result = range_lock(&lli->lli_write_tree,
1185 range_locked = true;
1187 down_read(&lli->lli_trunc_sem);
1190 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1191 vio->u.splice.cui_flags = args->u.splice.via_flags;
1194 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1198 ll_cl_add(file, env, io);
1199 result = cl_io_loop(env, io);
1200 ll_cl_remove(file, env);
1202 if (args->via_io_subtype == IO_NORMAL)
1203 up_read(&lli->lli_trunc_sem);
1205 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1207 range_unlock(&lli->lli_write_tree, &range);
1210 /* cl_io_rw_init() handled IO */
1211 result = io->ci_result;
/* Bytes were transferred: report them and advance the file position. */
1214 if (io->ci_nob > 0) {
1215 result = io->ci_nob;
1216 *ppos = io->u.ci_wr.wr.crw_pos;
1220 cl_io_fini(env, io);
1221 /* If any bit been read/written (result != 0), we just return
1222 * short read/write instead of restart io. */
1223 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1224 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1225 iot == CIT_READ ? "read" : "write",
1226 file->f_dentry->d_name.name, *ppos, count);
1227 LASSERTF(io->ci_nob == 0, "%zd\n", io->ci_nob);
/* Tally per-direction byte counts; failed writes (other than signal
 * interruption) mark fd_write_failed for close-time reporting. */
1231 if (iot == CIT_READ) {
1233 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1234 LPROC_LL_READ_BYTES, result);
1235 } else if (iot == CIT_WRITE) {
1237 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1238 LPROC_LL_WRITE_BYTES, result);
1239 fd->fd_write_failed = false;
1240 } else if (result != -ERESTARTSYS) {
1241 fd->fd_write_failed = true;
1244 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
/*
 * Validate a userspace iovec array and compute the total byte count.
 * Segments are checked for sign/wrap overflow and with access_ok(); on an
 * inaccessible segment the count is truncated to the verified prefix.
 * NOTE(review): lines are missing from this excerpt (locals, returns);
 * comments cover only the visible statements.
 */
1251 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1253 static int ll_file_get_iov_count(const struct iovec *iov,
1254 unsigned long *nr_segs, size_t *count)
1259 for (seg = 0; seg < *nr_segs; seg++) {
1260 const struct iovec *iv = &iov[seg];
1263 * If any segment has a negative length, or the cumulative
1264 * length ever wraps negative then return -EINVAL.
/* cnt|iv->iov_len going negative as ssize_t catches both a huge
 * segment and cumulative wrap in one test. */
1267 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1269 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1274 cnt -= iv->iov_len; /* This segment is no good */
/*
 * AIO read entry point: validate the iovec, build IO_NORMAL vvp args in a
 * cl_env, and run the common engine with CIT_READ at iocb->ki_pos.
 */
1281 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos)
1285 struct vvp_io_args *args;
1291 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1295 env = cl_env_get(&refcheck);
1297 RETURN(PTR_ERR(env));
1299 args = vvp_env_args(env, IO_NORMAL);
/* The const cast is safe: read path does not modify the iovec array. */
1300 args->u.normal.via_iov = (struct iovec *)iov;
1301 args->u.normal.via_nrsegs = nr_segs;
1302 args->u.normal.via_iocb = iocb;
1304 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1305 &iocb->ki_pos, count);
1306 cl_env_put(env, &refcheck);
/*
 * Synchronous read(2) entry point: wraps the user buffer in a single-segment
 * iovec and a sync kiocb (both stored in per-env thread info to avoid stack
 * usage), delegates to ll_file_aio_read(), and propagates the updated file
 * position back to *ppos.
 */
1310 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1314 struct iovec *local_iov;
1315 struct kiocb *kiocb;
1320 env = cl_env_get(&refcheck);
1322 RETURN(PTR_ERR(env));
1324 local_iov = &vvp_env_info(env)->vti_local_iov;
1325 kiocb = &vvp_env_info(env)->vti_kiocb;
1326 local_iov->iov_base = (void __user *)buf;
1327 local_iov->iov_len = count;
1328 init_sync_kiocb(kiocb, file);
1329 kiocb->ki_pos = *ppos;
/* Field name for the remaining-byte count differs across kernel versions. */
1330 #ifdef HAVE_KIOCB_KI_LEFT
1331 kiocb->ki_left = count;
1333 kiocb->ki_nbytes = count;
1336 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1337 *ppos = kiocb->ki_pos;
1339 cl_env_put(env, &refcheck);
/*
 * AIO write entry point (through the page cache): validate the iovec, build
 * IO_NORMAL vvp args in a cl_env, and run the common engine with CIT_WRITE
 * at iocb->ki_pos.  Mirrors ll_file_aio_read().
 */
1344 * Write to a file (through the page cache).
1347 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1348 unsigned long nr_segs, loff_t pos)
1351 struct vvp_io_args *args;
1357 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1361 env = cl_env_get(&refcheck);
1363 RETURN(PTR_ERR(env));
1365 args = vvp_env_args(env, IO_NORMAL);
1366 args->u.normal.via_iov = (struct iovec *)iov;
1367 args->u.normal.via_nrsegs = nr_segs;
1368 args->u.normal.via_iocb = iocb;
1370 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1371 &iocb->ki_pos, count);
1372 cl_env_put(env, &refcheck);
/*
 * Synchronous write(2) entry point: wraps the user buffer in a
 * single-segment iovec and a sync kiocb from per-env thread info, delegates
 * to ll_file_aio_write(), and propagates the updated position back to *ppos.
 * Mirrors ll_file_read().
 */
1376 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1377 size_t count, loff_t *ppos)
1380 struct iovec *local_iov;
1381 struct kiocb *kiocb;
1386 env = cl_env_get(&refcheck);
1388 RETURN(PTR_ERR(env));
1390 local_iov = &vvp_env_info(env)->vti_local_iov;
1391 kiocb = &vvp_env_info(env)->vti_kiocb;
1392 local_iov->iov_base = (void __user *)buf;
1393 local_iov->iov_len = count;
1394 init_sync_kiocb(kiocb, file);
1395 kiocb->ki_pos = *ppos;
/* Field name for the remaining-byte count differs across kernel versions. */
1396 #ifdef HAVE_KIOCB_KI_LEFT
1397 kiocb->ki_left = count;
1399 kiocb->ki_nbytes = count;
1402 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1403 *ppos = kiocb->ki_pos;
1405 cl_env_put(env, &refcheck);
/*
 * splice_read entry point: builds IO_SPLICE vvp args carrying the target
 * pipe and splice flags, then runs the common engine with CIT_READ so file
 * content flows from the page cache into the pipe.
 */
1410 * Send file content (through pagecache) somewhere with helper
1412 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1413 struct pipe_inode_info *pipe, size_t count,
1417 struct vvp_io_args *args;
1422 env = cl_env_get(&refcheck);
1424 RETURN(PTR_ERR(env));
1426 args = vvp_env_args(env, IO_SPLICE);
1427 args->u.splice.via_pipe = pipe;
1428 args->u.splice.via_flags = flags;
1430 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1431 cl_env_put(env, &refcheck);
/*
 * Re-create the OST object(s) of @inode identified by object id @oi on
 * OST index @ost_idx, by cloning the inode's striping md and issuing
 * obd_create() with OBD_FL_RECREATE_OBJS under the inode size lock.
 * Used by the RECREATE_OBJ/RECREATE_FID ioctls for disaster recovery.
 * NOTE(review): allocation of @oa and some error paths are among the
 * lines missing from this excerpt.
 */
1435 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1438 struct obd_export *exp = ll_i2dtexp(inode);
1439 struct obd_trans_info oti = { 0 };
1440 struct obdo *oa = NULL;
1443 struct lov_stripe_md *lsm = NULL, *lsm2;
1450 lsm = ccc_inode_lsm_get(inode);
/* Nothing to recreate if the file has no OST objects yet. */
1451 if (!lsm_has_objects(lsm))
1452 GOTO(out, rc = -ENOENT);
1454 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1455 (lsm->lsm_stripe_count));
1457 OBD_ALLOC_LARGE(lsm2, lsm_size);
1459 GOTO(out, rc = -ENOMEM);
/* o_nlink carries the target OST index for the recreate request. */
1462 oa->o_nlink = ost_idx;
1463 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1464 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1465 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1466 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1467 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
/* Work on a copy of the lsm so the live one is untouched on failure. */
1468 memcpy(lsm2, lsm, lsm_size);
1469 ll_inode_size_lock(inode);
1470 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1471 ll_inode_size_unlock(inode);
1473 OBD_FREE_LARGE(lsm2, lsm_size);
1476 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a struct ll_recreate_obj from
 * userspace (admin only), build an MDT0-sequence ost_id from the given
 * object id, and recreate it on the requested OST index.
 */
1481 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1483 struct ll_recreate_obj ucreat;
1487 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1490 if (copy_from_user(&ucreat, (struct ll_recreate_obj __user *)arg,
1494 ostid_set_seq_mdt0(&oi);
1495 ostid_set_id(&oi, ucreat.lrc_id);
1496 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
/*
 * LL_IOC_RECREATE_FID handler: copy a FID from userspace (admin only),
 * convert it to an ost_id, derive the OST index from the FID sequence,
 * and recreate the object.
 */
1499 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1506 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1509 if (copy_from_user(&fid, (struct lu_fid __user *)arg, sizeof(fid)))
1512 fid_to_ostid(&fid, &oi);
/* OST index is encoded in bits 16..31 of the FID sequence. */
1513 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1514 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
/*
 * Apply a user-supplied striping layout (lov_user_md) to @inode by
 * re-opening the file by FID with an IT_OPEN intent that carries the
 * layout EA.  Fails with -EEXIST if the file is already striped.  The
 * open handle obtained purely for the setstripe is released again via
 * ll_release_openhandle().
 * NOTE(review): some locals and label lines are missing from this
 * excerpt; the out/out_unlock/out_req_free cleanup order is inferred
 * from the visible GOTOs.
 */
1517 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1518 __u64 flags, struct lov_user_md *lum,
1521 struct lov_stripe_md *lsm = NULL;
1522 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
/* Striping can only be set once; an existing lsm means EEXIST. */
1526 lsm = ccc_inode_lsm_get(inode);
1528 ccc_inode_lsm_put(inode, lsm);
1529 CDEBUG(D_IOCTL, "stripe already exists for inode "DFID"\n",
1530 PFID(ll_inode2fid(inode)));
1531 GOTO(out, rc = -EEXIST);
1534 ll_inode_size_lock(inode);
1535 oit.it_flags |= MDS_OPEN_BY_FID;
1536 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1538 GOTO(out_unlock, rc);
1539 rc = oit.d.lustre.it_status;
1541 GOTO(out_req_free, rc);
/* Close the handle the intent open created just for the setstripe. */
1543 ll_release_openhandle(file->f_dentry, &oit);
1546 ll_inode_size_unlock(inode);
1547 ll_intent_release(&oit);
1548 ccc_inode_lsm_put(inode, lsm);
/* Clear the delay-create flag now that the layout is instantiated. */
1550 cl_lov_delay_create_clear(&file->f_flags);
1553 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping layout) of @filename from the MDS via
 * md_getattr_name().  On success *lmmp points into the reply buffer
 * (caller must keep *request until done) and *lmm_size is set.  If the
 * client's native LOV_MAGIC is little-endian-equal, the wire data is
 * byte-swapped in place to host endian before being handed back.
 * NOTE(review): several declarations and the out/return lines are
 * missing from this excerpt.
 */
1557 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1558 struct lov_mds_md **lmmp, int *lmm_size,
1559 struct ptlrpc_request **request)
1561 struct ll_sb_info *sbi = ll_i2sbi(inode);
1562 struct mdt_body *body;
1563 struct lov_mds_md *lmm = NULL;
1564 struct ptlrpc_request *req = NULL;
1565 struct md_op_data *op_data;
/* Ask the MDS for a buffer large enough for the default EA size. */
1568 rc = ll_get_default_mdsize(sbi, &lmmsize);
1572 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1573 strlen(filename), lmmsize,
1574 LUSTRE_OPC_ANY, NULL);
1575 if (IS_ERR(op_data))
1576 RETURN(PTR_ERR(op_data));
1578 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1579 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1580 ll_finish_md_op_data(op_data);
1582 CDEBUG(D_INFO, "md_getattr_name failed "
1583 "on %s: rc %d\n", filename, rc);
1587 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1588 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1590 lmmsize = body->mbo_eadatasize;
/* No EA present (or zero-sized) means the file has no striping. */
1592 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1594 GOTO(out, rc = -ENODATA);
1597 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1598 LASSERT(lmm != NULL);
/* Only plain V1/V3 layouts are understood here. */
1600 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1601 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1602 GOTO(out, rc = -EPROTO);
1606 * This is coming from the MDS, so is probably in
1607 * little endian. We convert it to host endian before
1608 * passing it to userspace.
/* If LOV_MAGIC equals its LE form, host-endian output needs a swab. */
1610 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1613 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1614 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1617 /* if function called for directory - we should
1618 * avoid swab not existent lsm objects */
1619 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1620 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1621 if (S_ISREG(body->mbo_mode))
1622 lustre_swab_lov_user_md_objects(
1623 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1625 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1626 lustre_swab_lov_user_md_v3(
1627 (struct lov_user_md_v3 *)lmm);
1628 if (S_ISREG(body->mbo_mode))
1629 lustre_swab_lov_user_md_objects(
1630 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1637 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler (admin only): copy a one-object lov_user_md
 * plus its ost data from userspace into a kernel buffer and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the caller
 * supplies pre-existing objects rather than asking for allocation.
 */
1642 static int ll_lov_setea(struct inode *inode, struct file *file,
1645 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1646 struct lov_user_md *lump;
1647 int lum_size = sizeof(struct lov_user_md) +
1648 sizeof(struct lov_user_ost_data);
1652 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1655 OBD_ALLOC_LARGE(lump, lum_size);
1659 if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
1660 OBD_FREE_LARGE(lump, lum_size);
1664 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1666 OBD_FREE_LARGE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: read the user's lov_user_md (first as V1,
 * re-read as V3 if the magic says so), apply it, then — on success —
 * refresh the layout and echo the resulting striping back to userspace
 * through the GETSTRIPE obd_iocontrol path.
 * NOTE(review): the success-branch condition line and some locals are
 * missing from this excerpt.
 */
1670 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1673 struct lov_user_md_v3 lumv3;
1674 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1675 struct lov_user_md_v1 __user *lumv1p =
1676 (struct lov_user_md_v1 __user *)arg;
1677 struct lov_user_md_v3 __user *lumv3p =
1678 (struct lov_user_md_v3 __user *)arg;
1680 __u64 flags = FMODE_WRITE;
1683 /* first try with v1 which is smaller than v3 */
1684 lum_size = sizeof(struct lov_user_md_v1);
1685 if (copy_from_user(lumv1, lumv1p, lum_size))
/* V3 magic: re-copy the larger structure (lumv1 aliases lumv3). */
1688 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1689 lum_size = sizeof(struct lov_user_md_v3);
1690 if (copy_from_user(&lumv3, lumv3p, lum_size))
1694 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1696 struct lov_stripe_md *lsm;
/* Zero the user's stripe_count so GETSTRIPE fills it afresh. */
1699 put_user(0, &lumv1p->lmm_stripe_count);
1701 ll_layout_refresh(inode, &gen);
1702 lsm = ccc_inode_lsm_get(inode);
1703 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1704 0, lsm, (void __user *)arg);
1705 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: pin the inode's stripe md and let the
 * data export's obd_iocontrol() copy the layout out to userspace at @arg.
 */
1710 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1712 struct lov_stripe_md *lsm;
1716 lsm = ccc_inode_lsm_get(inode);
1718 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1719 lsm, (void __user *)arg);
1720 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a cluster-wide group lock with gid
 * @arg on the file's cl_object and record it in the file descriptor's
 * private data.  lli_lock is dropped around the (potentially blocking)
 * cl_get_grouplock() call, so the "already locked" state is re-checked
 * afterwards to handle a racing thread.
 */
1725 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1727 struct ll_inode_info *lli = ll_i2info(inode);
1728 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1729 struct ccc_grouplock grouplock;
/* Group locks are meaningless on nolock mounts. */
1733 if (ll_file_nolock(file))
1734 RETURN(-EOPNOTSUPP);
1736 spin_lock(&lli->lli_lock);
1737 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1738 CWARN("group lock already existed with gid %lu\n",
1739 fd->fd_grouplock.cg_gid);
1740 spin_unlock(&lli->lli_lock);
1743 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1744 spin_unlock(&lli->lli_lock);
/* May block unless O_NONBLOCK is set on the file. */
1746 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1747 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under the lock: another thread may have won meanwhile. */
1751 spin_lock(&lli->lli_lock);
1752 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1753 spin_unlock(&lli->lli_lock);
1754 CERROR("another thread just won the race\n");
1755 cl_put_grouplock(&grouplock);
1759 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1760 fd->fd_grouplock = grouplock;
1761 spin_unlock(&lli->lli_lock);
1763 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: verify this fd holds a group lock with the
 * matching gid @arg, detach it from the fd under lli_lock, then release
 * the cluster lock outside the spinlock via cl_put_grouplock().
 */
1767 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1769 struct ll_inode_info *lli = ll_i2info(inode);
1770 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1771 struct ccc_grouplock grouplock;
1774 spin_lock(&lli->lli_lock);
1775 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1776 spin_unlock(&lli->lli_lock);
1777 CWARN("no group lock held\n");
1780 LASSERT(fd->fd_grouplock.cg_lock != NULL);
/* The gid passed to unlock must match the one used to lock. */
1782 if (fd->fd_grouplock.cg_gid != arg) {
1783 CWARN("group lock %lu doesn't match current id %lu\n",
1784 arg, fd->fd_grouplock.cg_gid);
1785 spin_unlock(&lli->lli_lock);
/* Take a local copy so the release happens after dropping lli_lock. */
1789 grouplock = fd->fd_grouplock;
1790 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1791 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1792 spin_unlock(&lli->lli_lock);
1794 cl_put_grouplock(&grouplock);
1795 CDEBUG(D_INFO, "group lock %lu released\n", arg);
/*
 * Close the MDS open handle that an intent open created, when the handle
 * is not going to be kept by a real file open (e.g. after setstripe).
 * No-op for the fs root or when the intent carries no open disposition.
 */
1800 * Close inode open handle
1802 * \param dentry [in] dentry which contains the inode
1803 * \param it [in,out] intent which contains open info and result
1806 * \retval <0 failure
1808 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1810 struct inode *inode = dentry->d_inode;
1811 struct obd_client_handle *och;
1817 /* Root ? Do nothing. */
1818 if (dentry->d_inode->i_sb->s_root == dentry)
1821 /* No open handle to close? Move away */
1822 if (!it_disposition(it, DISP_OPEN_OPEN))
1825 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1827 OBD_ALLOC(och, sizeof(*och));
1829 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent, then close it on the MDS. */
1831 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1833 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1836 /* this one is in place of ll_file_open */
1837 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1838 ptlrpc_req_finished(it->d.lustre.it_data);
1839 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Execute a FIEMAP request against the data export: validate the flags,
 * optionally flush dirty pages (FIEMAP_FLAG_SYNC), fill an
 * ll_fiemap_info_key from the inode's striping/size, and fetch the extent
 * mapping with obd_get_info(KEY_FIEMAP).
 */
1845 * Get size for inode for which FIEMAP mapping is requested.
1846 * Make the FIEMAP get_info call and returns the result.
1848 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1851 struct obd_export *exp = ll_i2dtexp(inode);
1852 struct lov_stripe_md *lsm = NULL;
1853 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1854 __u32 vallen = num_bytes;
1858 /* Checks for fiemap flags */
/* Unknown flags are stripped and reported back to the caller. */
1859 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1860 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1864 /* Check for FIEMAP_FLAG_SYNC */
1865 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1866 rc = filemap_fdatawrite(inode->i_mapping);
1871 lsm = ccc_inode_lsm_get(inode);
1875 /* If the stripe_count > 1 and the application does not understand
1876 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1878 if (lsm->lsm_stripe_count > 1 &&
1879 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1880 GOTO(out, rc = -EOPNOTSUPP);
1882 fm_key.oa.o_oi = lsm->lsm_oi;
1883 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1885 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1886 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1887 /* If filesize is 0, then there would be no objects for mapping */
1888 if (fm_key.oa.o_size == 0) {
1889 fiemap->fm_mapped_extents = 0;
1893 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1895 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1898 CERROR("obd_get_info failed: rc = %d\n", rc);
1901 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: translate a FID to a pathname via the MD
 * export.  The caller supplies the output buffer length in gf_pathlen;
 * the request struct plus path buffer is allocated, filled by
 * obd_iocontrol(OBD_IOC_FID2PATH), and copied back.  Restricted to
 * CAP_DAC_READ_SEARCH unless the mount allows user fid2path.
 */
1905 int ll_fid2path(struct inode *inode, void __user *arg)
1907 struct obd_export *exp = ll_i2mdexp(inode);
1908 const struct getinfo_fid2path __user *gfin = arg;
1910 struct getinfo_fid2path *gfout;
1916 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1917 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1920 /* Only need to get the buflen */
1921 if (get_user(pathlen, &gfin->gf_pathlen))
/* Cap the user-controlled allocation size. */
1924 if (pathlen > PATH_MAX)
1927 outsize = sizeof(*gfout) + pathlen;
1928 OBD_ALLOC(gfout, outsize);
1932 if (copy_from_user(gfout, arg, sizeof(*gfout)))
1933 GOTO(gf_free, rc = -EFAULT);
1935 /* Call mdc_iocontrol */
1936 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1940 if (copy_to_user(arg, gfout, outsize))
1944 OBD_FREE(gfout, outsize);
/*
 * FSFILT_IOC_FIEMAP handler: size the kernel fiemap buffer from the
 * user's fm_extent_count (with overflow check), copy in the request (and
 * first extent, used as the continuation point for chunked queries), run
 * ll_do_fiemap(), and copy the header plus mapped extents back out.
 */
1948 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1950 struct ll_user_fiemap *fiemap_s;
1951 size_t num_bytes, ret_bytes;
1952 unsigned int extent_count;
1955 /* Get the extent count so we can calculate the size of
1956 * required fiemap buffer */
1957 if (get_user(extent_count,
1958 &((struct ll_user_fiemap __user *)arg)->fm_extent_count)
/* Reject counts whose buffer size would overflow size_t. */
1962 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1964 num_bytes = sizeof(*fiemap_s) + (extent_count *
1965 sizeof(struct ll_fiemap_extent));
1967 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1968 if (fiemap_s == NULL)
1971 /* get the fiemap value */
1972 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1974 GOTO(error, rc = -EFAULT);
1976 /* If fm_extent_count is non-zero, read the first extent since
1977 * it is used to calculate end_offset and device from previous
1980 if (copy_from_user(&fiemap_s->fm_extents[0],
1981 (char __user *)arg + sizeof(*fiemap_s),
1982 sizeof(struct ll_fiemap_extent)))
1983 GOTO(error, rc = -EFAULT);
1986 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
/* Copy back the header, plus extents only if some were requested. */
1990 ret_bytes = sizeof(struct ll_user_fiemap);
1992 if (extent_count != 0)
1993 ret_bytes += (fiemap_s->fm_mapped_extents *
1994 sizeof(struct ll_fiemap_extent));
1996 if (copy_to_user((void __user *)arg, fiemap_s, ret_bytes))
2000 OBD_FREE_LARGE(fiemap_s, num_bytes);
/*
 * Compute the inode's data version from its OST stripe objects via
 * ll_lsm_getattr().  A file with no objects has version 0.  @flags may
 * request flushing (LL_DV_RD_FLUSH / LL_DV_WR_FLUSH) so the version is
 * stable against cached dirty data.
 */
2005 * Read the data_version for inode.
2007 * This value is computed using stripe object version on OST.
2008 * Version is computed using server side locking.
2010 * @param sync if do sync on the OST side;
2012 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2013 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2015 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2017 struct lov_stripe_md *lsm = NULL;
2018 struct ll_sb_info *sbi = ll_i2sbi(inode);
2019 struct obdo *obdo = NULL;
2023 /* If no stripe, we consider version is 0. */
2024 lsm = ccc_inode_lsm_get(inode);
2025 if (!lsm_has_objects(lsm)) {
2027 CDEBUG(D_INODE, "No object for inode\n");
2031 OBD_ALLOC_PTR(obdo);
2033 GOTO(out, rc = -ENOMEM);
2035 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, flags);
/* Only report a version the OSTs actually returned as valid. */
2037 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
2040 *data_version = obdo->o_data_version;
2046 ccc_inode_lsm_put(inode, lsm);
/*
 * HSM release: take a write lease on the file, flush and snapshot the
 * data version, merge attributes, then close the file through the MDS
 * with the release flag so the OST objects can be freed while the copy
 * remains in the HSM archive.  The lease handle is consumed by the close
 * (packed into the release RPC); on the error path it is closed locally.
 */
2051 * Trigger a HSM release request for the provided inode.
2053 int ll_hsm_release(struct inode *inode)
2055 struct cl_env_nest nest;
2057 struct obd_client_handle *och = NULL;
2058 __u64 data_version = 0;
2062 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2063 ll_get_fsname(inode->i_sb, NULL, 0),
2064 PFID(&ll_i2info(inode)->lli_fid));
2066 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2068 GOTO(out, rc = PTR_ERR(och));
2070 /* Grab latest data_version and [am]time values */
2071 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2075 env = cl_env_nested_get(&nest);
2077 GOTO(out, rc = PTR_ERR(env));
/* Pull OST attributes (size/times) into the inode before release. */
2079 ll_merge_lvb(env, inode);
2080 cl_env_nested_put(&nest, env);
2082 /* Release the file.
2083 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2084 * we still need it to pack l_remote_handle to MDT. */
2085 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
2091 if (och != NULL && !IS_ERR(och)) /* close the file */
2092 ll_lease_close(och, inode, NULL);
/* Scratch state for ll_swap_layouts(): the two inodes, the saved
 * [am]times to restore afterwards, and the data-version check flags.
 * Heap-allocated because it is too large for the stack. */
2097 struct ll_swap_stack {
2098 struct iattr ia1, ia2;
2100 struct inode *inode1, *inode2;
2101 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * striping layouts of two regular files on the same filesystem.
 * Ordering by FID prevents deadlock; an optional group lock (gid != 0)
 * flushes dirty cache; optional data-version checks abort with -EAGAIN
 * if either file changed; mtime/atime are saved and restored when the
 * KEEP_* flags ask for it.  The actual swap is an obd_iocontrol() on the
 * MD export with both inodes in one md_op_data.
 * NOTE(review): several lines (locals, some GOTO targets, return) are
 * missing from this excerpt.
 */
2104 static int ll_swap_layouts(struct file *file1, struct file *file2,
2105 struct lustre_swap_layouts *lsl)
2107 struct mdc_swap_layouts msl;
2108 struct md_op_data *op_data;
2111 struct ll_swap_stack *llss = NULL;
2114 OBD_ALLOC_PTR(llss);
2118 llss->inode1 = file1->f_dentry->d_inode;
2119 llss->inode2 = file2->f_dentry->d_inode;
/* Only regular files can have their layouts swapped. */
2121 if (!S_ISREG(llss->inode2->i_mode))
2122 GOTO(free, rc = -EINVAL);
2124 if (inode_permission(llss->inode1, MAY_WRITE) ||
2125 inode_permission(llss->inode2, MAY_WRITE))
2126 GOTO(free, rc = -EPERM);
2128 if (llss->inode2->i_sb != llss->inode1->i_sb)
2129 GOTO(free, rc = -EXDEV);
2131 /* we use 2 bool because it is easier to swap than 2 bits */
2132 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2133 llss->check_dv1 = true;
2135 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2136 llss->check_dv2 = true;
2138 /* we cannot use lsl->sl_dvX directly because we may swap them */
2139 llss->dv1 = lsl->sl_dv1;
2140 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so concurrent swaps cannot deadlock. */
2142 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2143 if (rc == 0) /* same file, done! */
2146 if (rc < 0) { /* sequentialize it */
2147 swap(llss->inode1, llss->inode2);
2149 swap(llss->dv1, llss->dv2);
2150 swap(llss->check_dv1, llss->check_dv2);
2154 if (gid != 0) { /* application asks to flush dirty cache */
2155 rc = ll_get_grouplock(llss->inode1, file1, gid);
2159 rc = ll_get_grouplock(llss->inode2, file2, gid);
2161 ll_put_grouplock(llss->inode1, file1, gid);
2166 /* to be able to restore mtime and atime after swap
2167 * we need to first save them */
2169 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2170 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2171 llss->ia1.ia_atime = llss->inode1->i_atime;
2172 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2173 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2174 llss->ia2.ia_atime = llss->inode2->i_atime;
2175 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2178 /* ultimate check, before swaping the layouts we check if
2179 * dataversion has changed (if requested) */
2180 if (llss->check_dv1) {
2181 rc = ll_data_version(llss->inode1, &dv, 0);
2184 if (dv != llss->dv1)
2185 GOTO(putgl, rc = -EAGAIN);
2188 if (llss->check_dv2) {
2189 rc = ll_data_version(llss->inode2, &dv, 0);
2192 if (dv != llss->dv2)
2193 GOTO(putgl, rc = -EAGAIN);
2196 /* struct md_op_data is used to send the swap args to the mdt
2197 * only flags is missing, so we use struct mdc_swap_layouts
2198 * through the md_op_data->op_data */
2199 /* flags from user space have to be converted before they are send to
2200 * server, no flag is sent today, they are only used on the client */
2203 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2204 0, LUSTRE_OPC_ANY, &msl);
2205 if (IS_ERR(op_data))
2206 GOTO(free, rc = PTR_ERR(op_data));
2208 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2209 sizeof(*op_data), op_data, NULL);
2210 ll_finish_md_op_data(op_data);
/* Drop group locks in reverse acquisition order. */
2214 ll_put_grouplock(llss->inode2, file2, gid);
2215 ll_put_grouplock(llss->inode1, file1, gid);
2218 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2222 /* clear useless flags */
2223 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2224 llss->ia1.ia_valid &= ~ATTR_MTIME;
2225 llss->ia2.ia_valid &= ~ATTR_MTIME;
2228 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2229 llss->ia1.ia_valid &= ~ATTR_ATIME;
2230 llss->ia2.ia_valid &= ~ATTR_ATIME;
2233 /* update time if requested */
/* Note ia2 is applied to inode1 and vice versa: the saved times follow
 * the layouts, which have been exchanged. */
2235 if (llss->ia2.ia_valid != 0) {
2236 mutex_lock(&llss->inode1->i_mutex);
2237 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2238 mutex_unlock(&llss->inode1->i_mutex);
2241 if (llss->ia1.ia_valid != 0) {
2244 mutex_lock(&llss->inode2->i_mutex);
2245 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2246 mutex_unlock(&llss->inode2->i_mutex);
/*
 * Set/clear HSM state flags on @inode by forwarding an
 * LL_IOC_HSM_STATE_SET obd_iocontrol to the MD export.  Flags outside
 * HSM_USER_MASK are reserved to CAP_SYS_ADMIN.
 */
2258 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2260 struct md_op_data *op_data;
2263 /* Non-root users are forbidden to set or clear flags which are
2264 * NOT defined in HSM_USER_MASK. */
2265 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2266 !cfs_capable(CFS_CAP_SYS_ADMIN))
2269 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2270 LUSTRE_OPC_ANY, hss);
2271 if (IS_ERR(op_data))
2272 RETURN(PTR_ERR(op_data));
2274 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2275 sizeof(*op_data), op_data, NULL);
2277 ll_finish_md_op_data(op_data);
/*
 * HSM import: register a file that exists only in the HSM archive.  Marks
 * it ARCHIVED|EXISTS|RELEASED via ll_hsm_state_set(), then forces the
 * mode/owner/size/times supplied in @hui onto the inode with
 * ll_setattr_raw() under i_mutex.
 * NOTE(review): the hss allocation line and the out/free cleanup are
 * among the lines missing from this excerpt.
 */
2282 static int ll_hsm_import(struct inode *inode, struct file *file,
2283 struct hsm_user_import *hui)
2285 struct hsm_state_set *hss = NULL;
2286 struct iattr *attr = NULL;
2290 if (!S_ISREG(inode->i_mode))
2296 GOTO(out, rc = -ENOMEM);
/* Step 1: set the HSM flags describing an archived, released file. */
2298 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2299 hss->hss_archive_id = hui->hui_archive_id;
2300 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2301 rc = ll_hsm_state_set(inode, hss);
2305 OBD_ALLOC_PTR(attr);
2307 GOTO(out, rc = -ENOMEM);
/* Step 2: impose the archived copy's attributes on the stub inode. */
2309 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2310 attr->ia_mode |= S_IFREG;
2311 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2312 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2313 attr->ia_size = hui->hui_size;
2314 attr->ia_mtime.tv_sec = hui->hui_mtime;
2315 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2316 attr->ia_atime.tv_sec = hui->hui_atime;
2317 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2319 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2320 ATTR_UID | ATTR_GID |
2321 ATTR_MTIME | ATTR_MTIME_SET |
2322 ATTR_ATIME | ATTR_ATIME_SET;
2324 mutex_lock(&inode->i_mutex);
2326 rc = ll_setattr_raw(file->f_dentry, attr, true);
2330 mutex_unlock(&inode->i_mutex);
/* Map an open fmode to the LL_LEASE_{RD,WR}LCK bits reported to
 * userspace by the lease ioctls. */
2342 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2344 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2345 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/*
 * Main ioctl dispatcher for regular files: per-fd flag get/set, striping
 * (setstripe/setea/getstripe/swap-layouts), object recreation, FIEMAP,
 * group locks, fid2path, data version, HSM state/action/import, and the
 * lease get/set/unlock protocol.  Unknown commands fall through to the
 * registered llite ioctl handlers and finally to the data export.
 * NOTE(review): many lines (locals, RETURN statements, some case labels
 * and closing braces) are missing from this excerpt; comments describe
 * only the visible statements.
 */
2349 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2351 struct inode *inode = file->f_dentry->d_inode;
2352 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
2357 PFID(ll_inode2fid(inode)), inode, cmd);
2358 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2360 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2361 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2365 case LL_IOC_GETFLAGS:
2366 /* Get the current value of the file flags */
2367 return put_user(fd->fd_flags, (int __user *)arg);
2368 case LL_IOC_SETFLAGS:
2369 case LL_IOC_CLRFLAGS:
2370 /* Set or clear specific file flags */
2371 /* XXX This probably needs checks to ensure the flags are
2372 * not abused, and to handle any flag side effects.
2374 if (get_user(flags, (int __user *) arg))
2377 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe with O_DIRECT (no cached pages). */
2378 if ((flags & LL_FILE_IGNORE_LOCK) &&
2379 !(file->f_flags & O_DIRECT)) {
2380 CERROR("%s: unable to disable locking on "
2381 "non-O_DIRECT file\n", current->comm);
2385 fd->fd_flags |= flags;
2387 fd->fd_flags &= ~flags;
2390 case LL_IOC_LOV_SETSTRIPE:
2391 RETURN(ll_lov_setstripe(inode, file, arg));
2392 case LL_IOC_LOV_SETEA:
2393 RETURN(ll_lov_setea(inode, file, arg));
2394 case LL_IOC_LOV_SWAP_LAYOUTS: {
2396 struct lustre_swap_layouts lsl;
2398 if (copy_from_user(&lsl, (char __user *)arg,
2399 sizeof(struct lustre_swap_layouts)))
/* Both files must be writable for a layout swap. */
2402 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2405 file2 = fget(lsl.sl_fd);
2410 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2411 rc = ll_swap_layouts(file, file2, &lsl);
2415 case LL_IOC_LOV_GETSTRIPE:
2416 RETURN(ll_lov_getstripe(inode, arg));
2417 case LL_IOC_RECREATE_OBJ:
2418 RETURN(ll_lov_recreate_obj(inode, arg));
2419 case LL_IOC_RECREATE_FID:
2420 RETURN(ll_lov_recreate_fid(inode, arg));
2421 case FSFILT_IOC_FIEMAP:
2422 RETURN(ll_ioctl_fiemap(inode, arg));
2423 case FSFILT_IOC_GETFLAGS:
2424 case FSFILT_IOC_SETFLAGS:
2425 RETURN(ll_iocontrol(inode, file, cmd, arg));
2426 case FSFILT_IOC_GETVERSION_OLD:
2427 case FSFILT_IOC_GETVERSION:
2428 RETURN(put_user(inode->i_generation, (int __user *)arg));
2429 case LL_IOC_GROUP_LOCK:
2430 RETURN(ll_get_grouplock(inode, file, arg));
2431 case LL_IOC_GROUP_UNLOCK:
2432 RETURN(ll_put_grouplock(inode, file, arg));
2433 case IOC_OBD_STATFS:
2434 RETURN(ll_obd_statfs(inode, (void __user *)arg));
2436 /* We need to special case any other ioctls we want to handle,
2437 * to send them to the MDS/OST as appropriate and to properly
2438 * network encode the arg field.
2439 case FSFILT_IOC_SETVERSION_OLD:
2440 case FSFILT_IOC_SETVERSION:
2442 case LL_IOC_FLUSHCTX:
2443 RETURN(ll_flush_ctx(inode));
2444 case LL_IOC_PATH2FID: {
2445 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
2446 sizeof(struct lu_fid)))
2451 case OBD_IOC_FID2PATH:
2452 RETURN(ll_fid2path(inode, (void __user *)arg));
2453 case LL_IOC_DATA_VERSION: {
2454 struct ioc_data_version idv;
2457 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the defined flush flags are honored from userspace. */
2460 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
2461 rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
2464 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
2470 case LL_IOC_GET_MDTIDX: {
2473 mdtidx = ll_get_mdt_idx(inode);
2477 if (put_user((int)mdtidx, (int __user *)arg))
2482 case OBD_IOC_GETDTNAME:
2483 case OBD_IOC_GETMDNAME:
2484 RETURN(ll_get_obd_name(inode, cmd, arg));
2485 case LL_IOC_HSM_STATE_GET: {
2486 struct md_op_data *op_data;
2487 struct hsm_user_state *hus;
2494 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2495 LUSTRE_OPC_ANY, hus);
2496 if (IS_ERR(op_data)) {
2498 RETURN(PTR_ERR(op_data));
2501 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2504 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
2507 ll_finish_md_op_data(op_data);
2511 case LL_IOC_HSM_STATE_SET: {
2512 struct hsm_state_set *hss;
2519 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
2524 rc = ll_hsm_state_set(inode, hss);
2529 case LL_IOC_HSM_ACTION: {
2530 struct md_op_data *op_data;
2531 struct hsm_current_action *hca;
2538 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2539 LUSTRE_OPC_ANY, hca);
2540 if (IS_ERR(op_data)) {
2542 RETURN(PTR_ERR(op_data));
2545 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2548 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
2551 ll_finish_md_op_data(op_data);
2555 case LL_IOC_SET_LEASE: {
2556 struct ll_inode_info *lli = ll_i2info(inode);
2557 struct obd_client_handle *och = NULL;
/* Requested lease mode must not exceed the file's open mode. */
2562 case LL_LEASE_WRLCK:
2563 if (!(file->f_mode & FMODE_WRITE))
2565 fmode = FMODE_WRITE;
2567 case LL_LEASE_RDLCK:
2568 if (!(file->f_mode & FMODE_READ))
/* UNLCK: detach the lease handle from the fd under the och mutex,
 * then close it and report the mode it had. */
2572 case LL_LEASE_UNLCK:
2573 mutex_lock(&lli->lli_och_mutex);
2574 if (fd->fd_lease_och != NULL) {
2575 och = fd->fd_lease_och;
2576 fd->fd_lease_och = NULL;
2578 mutex_unlock(&lli->lli_och_mutex);
2583 fmode = och->och_flags;
2584 rc = ll_lease_close(och, inode, &lease_broken);
2591 RETURN(ll_lease_type_from_fmode(fmode));
2596 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
2598 /* apply for lease */
2599 och = ll_lease_open(inode, file, fmode, 0);
2601 RETURN(PTR_ERR(och));
/* Only install the new lease if the fd does not already hold one. */
2604 mutex_lock(&lli->lli_och_mutex);
2605 if (fd->fd_lease_och == NULL) {
2606 fd->fd_lease_och = och;
2609 mutex_unlock(&lli->lli_och_mutex);
2611 /* impossible now that only excl is supported for now */
2612 ll_lease_close(och, inode, &lease_broken);
2617 case LL_IOC_GET_LEASE: {
2618 struct ll_inode_info *lli = ll_i2info(inode);
2619 struct ldlm_lock *lock = NULL;
2622 mutex_lock(&lli->lli_och_mutex);
2623 if (fd->fd_lease_och != NULL) {
2624 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only if its DLM lock is not cancelled. */
2626 lock = ldlm_handle2lock(&och->och_lease_handle);
2628 lock_res_and_lock(lock);
2629 if (!ldlm_is_cancel(lock))
2630 fmode = och->och_flags;
2632 unlock_res_and_lock(lock);
2633 LDLM_LOCK_PUT(lock);
2636 mutex_unlock(&lli->lli_och_mutex);
2638 RETURN(ll_lease_type_from_fmode(fmode));
2640 case LL_IOC_HSM_IMPORT: {
2641 struct hsm_user_import *hui;
2647 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
2652 rc = ll_hsm_import(inode, file, hui);
/* Fallback: registered llite handlers, then the data export. */
2662 ll_iocontrol_call(inode, file, cmd, arg, &err))
2665 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2666 (void __user *)arg));
/* Compat helpers for kernels without generic_file_llseek_size(). */
2671 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Validate @offset against [0 (unless FMODE_UNSIGNED_OFFSET), maxsize]
 * and commit it to file->f_pos, resetting f_version on movement.
 */
2672 static inline loff_t
2673 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2675 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2677 if (offset > maxsize)
/* Avoid touching f_pos/f_version when the position is unchanged. */
2680 if (offset != file->f_pos) {
2681 file->f_pos = offset;
2682 file->f_version = 0;
/*
 * Local copy of the VFS generic_file_llseek_size(), provided when the kernel
 * does not export one.  Computes the new position for the various SEEK_*
 * origins against @maxsize (limit) and @eof (current file size).
 * NOTE(review): excerpted listing — the switch over @origin and several
 * return statements are not visible here.
 */
2688 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2689 loff_t maxsize, loff_t eof)
2691 struct inode *inode = file->f_dentry->d_inode;
2699 * Here we special-case the lseek(fd, 0, SEEK_CUR)
2700 * position-querying operation. Avoid rewriting the "same"
2701 * f_pos value back to the file because a concurrent read(),
2702 * write() or lseek() might have altered it
2707 * f_lock protects against read/modify/write race with other
2708 * SEEK_CURs. Note that parallel writes and reads behave
/* SEEK_CUR: read-modify-write of f_pos must be done under i_mutex so a
 * concurrent seek cannot interleave. */
2711 mutex_lock(&inode->i_mutex);
2712 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2713 mutex_unlock(&inode->i_mutex);
2717 * In the generic case the entire file is data, so as long as
2718 * offset isn't at the end of the file then the offset is data.
2725 * There is a virtual hole at the end of the file, so as long as
2726 * offset isn't i_size or larger, return i_size.
2734 return llseek_execute(file, offset, maxsize);
/*
 * Lustre ->llseek handler.  For size-relative origins (SEEK_END, SEEK_HOLE,
 * SEEK_DATA) it first glimpses the file size from the OSTs so i_size is
 * current, then delegates to ll_generic_file_llseek_size() bounded by the
 * filesystem's maximum byte offset.
 */
2738 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2740 struct inode *inode = file->f_dentry->d_inode;
2741 loff_t retval, eof = 0;
/* Tentative target, used only for the trace message below. */
2744 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2745 (origin == SEEK_CUR) ? file->f_pos : 0);
2746 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
2747 PFID(ll_inode2fid(inode)), inode, retval, retval,
2749 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* These origins depend on an up-to-date file size; fetch it via a
 * glimpse lock before computing the result. */
2751 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2752 retval = ll_glimpse_size(inode);
2755 eof = i_size_read(inode);
2758 retval = ll_generic_file_llseek_size(file, offset, origin,
2759 ll_file_maxbytes(inode), eof);
/*
 * ->flush handler, called on every close of a file descriptor.  Does not
 * write anything itself; it only harvests asynchronous write errors that
 * were recorded earlier and reports them (once) to the application.
 * Returns 0 or -EIO.
 */
2763 static int ll_flush(struct file *file, fl_owner_t id)
2765 struct inode *inode = file->f_dentry->d_inode;
2766 struct ll_inode_info *lli = ll_i2info(inode);
2767 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2770 LASSERT(!S_ISDIR(inode->i_mode));
2772 /* catch async errors that were recorded back when async writeback
2773 * failed for pages in this mapping. */
2774 rc = lli->lli_async_rc;
2775 lli->lli_async_rc = 0;
/* Also collect per-object async errors from the LOV layer, if a cl
 * object is attached to this inode. */
2776 if (lli->lli_clob != NULL) {
2777 err = lov_read_and_clear_async_rc(lli->lli_clob);
2782 /* The application has been told write failure already.
2783 * Do not report failure again. */
2784 if (fd->fd_write_failed)
2786 return rc ? -EIO : 0;
2790 * Called to make sure a portion of file has been written out.
2791 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2793 * Return how many pages have been written.
/*
 * Flush dirty pages of [start, end] for @inode through the cl_io machinery.
 * Unless @mode is CL_FSYNC_LOCAL, OST_SYNC RPCs are sent so data reaches
 * stable storage on the servers.  @ignore_layout allows the sync to proceed
 * during a layout change.  On success returns the number of pages written;
 * on failure a negative errno (per the header comment above this function).
 */
2795 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2796 enum cl_fsync_mode mode, int ignore_layout)
2798 struct cl_env_nest nest;
2801 struct obd_capa *capa = NULL;
2802 struct cl_fsync_io *fio;
/* Reject any mode outside the known CL_FSYNC_* set. */
2806 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2807 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2810 env = cl_env_nested_get(&nest);
2812 RETURN(PTR_ERR(env));
/* OSS write capability for the sync RPCs (capa may be NULL when
 * capabilities are disabled). */
2814 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2816 io = ccc_env_thread_io(env);
2817 io->ci_obj = cl_i2info(inode)->lli_clob;
2818 io->ci_ignore_layout = ignore_layout;
2820 /* initialize parameters for sync */
2821 fio = &io->u.ci_fsync;
2822 fio->fi_capa = capa;
2823 fio->fi_start = start;
2825 fio->fi_fid = ll_inode2fid(inode);
2826 fio->fi_mode = mode;
2827 fio->fi_nr_written = 0;
/* Run the CIT_FSYNC io; if init fails, propagate ci_result instead. */
2829 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2830 result = cl_io_loop(env, io);
2832 result = io->ci_result;
2834 result = fio->fi_nr_written;
2835 cl_io_fini(env, io);
2836 cl_env_nested_put(&nest, env);
2844 * When dentry is provided (the 'else' case), *file->f_dentry may be
2845 * null and dentry must be used directly rather than pulled from
2846 * *file->f_dentry as is done otherwise.
/*
 * ->fsync handler.  Three prototypes are supported depending on the kernel
 * (4-arg ranged fsync, 2-arg, or the old 3-arg dentry variant); the older
 * variants sync the whole file (end = LLONG_MAX).  Sequence: flush page
 * cache, harvest recorded async write errors, fsync metadata on the MDT,
 * then (for regular files) force data to the OSTs via cl_sync_file_range()
 * and update the per-fd write-failure state.
 */
2849 #ifdef HAVE_FILE_FSYNC_4ARGS
2850 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2852 struct dentry *dentry = file->f_dentry;
2853 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2854 int ll_fsync(struct file *file, int datasync)
2856 struct dentry *dentry = file->f_dentry;
2858 loff_t end = LLONG_MAX;
2860 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2863 loff_t end = LLONG_MAX;
2865 struct inode *inode = dentry->d_inode;
2866 struct ll_inode_info *lli = ll_i2info(inode);
2867 struct ptlrpc_request *req;
2868 struct obd_capa *oc;
2872 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
2873 PFID(ll_inode2fid(inode)), inode);
2874 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
/* Ranged fsync: we must write and wait the mapping ourselves and take
 * i_mutex (released at the end of the function). */
2876 #ifdef HAVE_FILE_FSYNC_4ARGS
2877 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2878 mutex_lock(&inode->i_mutex);
2880 /* fsync's caller has already called _fdata{sync,write}, we want
2881 * that IO to finish before calling the osc and mdc sync methods */
2882 rc = filemap_fdatawait(inode->i_mapping);
2885 /* catch async errors that were recorded back when async writeback
2886 * failed for pages in this mapping. */
2887 if (!S_ISDIR(inode->i_mode)) {
2888 err = lli->lli_async_rc;
2889 lli->lli_async_rc = 0;
2892 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync metadata on the MDT (capa may be NULL when disabled). */
2897 oc = ll_mdscapa_get(inode);
2898 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2904 ptlrpc_req_finished(req);
/* Regular files: force data to the OSTs and remember whether the
 * write failure has now been reported to the application. */
2906 if (S_ISREG(inode->i_mode)) {
2907 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2909 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2910 if (rc == 0 && err < 0)
2913 fd->fd_write_failed = true;
2915 fd->fd_write_failed = false;
2918 #ifdef HAVE_FILE_FSYNC_4ARGS
2919 mutex_unlock(&inode->i_mutex);
/*
 * ->lock/->flock handler: translate a VFS file_lock (POSIX fcntl lock or
 * BSD flock) into an LDLM_FLOCK enqueue on the MDT, then mirror the server
 * result into the local lock tables so the VFS bookkeeping stays coherent.
 * NOTE(review): excerpted listing — several switch labels, error returns
 * and the final RETURN are not visible here.
 */
2925 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2927 struct inode *inode = file->f_dentry->d_inode;
2928 struct ll_sb_info *sbi = ll_i2sbi(inode);
2929 struct ldlm_enqueue_info einfo = {
2930 .ei_type = LDLM_FLOCK,
2931 .ei_cb_cp = ldlm_flock_completion_ast,
2932 .ei_cbdata = file_lock,
2934 struct md_op_data *op_data;
2935 struct lustre_handle lockh = {0};
2936 ldlm_policy_data_t flock = {{0}};
/* Saved so it can be restored after the enqueue overwrites fl_type. */
2937 int fl_type = file_lock->fl_type;
2943 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
2944 PFID(ll_inode2fid(inode)), file_lock);
2946 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2948 if (file_lock->fl_flags & FL_FLOCK) {
2949 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2950 /* flocks are whole-file locks */
2951 flock.l_flock.end = OFFSET_MAX;
2952 /* For flocks owner is determined by the local file descriptor*/
2953 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2954 } else if (file_lock->fl_flags & FL_POSIX) {
2955 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2956 flock.l_flock.start = file_lock->fl_start;
2957 flock.l_flock.end = file_lock->fl_end;
2961 flock.l_flock.pid = file_lock->fl_pid;
2963 /* Somewhat ugly workaround for svc lockd.
2964 * lockd installs custom fl_lmops->lm_compare_owner that checks
2965 * for the fl_owner to be the same (which it always is on local node
2966 * I guess between lockd processes) and then compares pid.
2967 * As such we assign pid to the owner field to make it all work,
2968 * conflict with normal locks is unlikely since pid space and
2969 * pointer space for current->files are not intersecting */
2970 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2971 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type onto an LDLM mode (PR = read, PW = write). */
2975 einfo.ei_mode = LCK_PR;
2978 /* An unlock request may or may not have any relation to
2979 * existing locks so we may not be able to pass a lock handle
2980 * via a normal ldlm_lock_cancel() request. The request may even
2981 * unlock a byte range in the middle of an existing lock. In
2982 * order to process an unlock request we need all of the same
2983 * information that is given with a normal read or write record
2984 * lock request. To avoid creating another ldlm unlock (cancel)
2985 * message we'll treat a LCK_NL flock request as an unlock. */
2986 einfo.ei_mode = LCK_NL;
2989 einfo.ei_mode = LCK_PW;
2992 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command onto enqueue flags: non-blocking request vs.
 * test-only probe. */
3007 flags = LDLM_FL_BLOCK_NOWAIT;
3013 flags = LDLM_FL_TEST_LOCK;
3016 CERROR("unknown fcntl lock command: %d\n", cmd);
3020 /* Save the old mode so that if the mode in the lock changes we
3021 * can decrement the appropriate reader or writer refcount. */
3022 file_lock->fl_type = einfo.ei_mode;
3024 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3025 LUSTRE_OPC_ANY, NULL);
3026 if (IS_ERR(op_data))
3027 RETURN(PTR_ERR(op_data));
3029 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags="LPX64", mode=%u, "
3030 "start="LPU64", end="LPU64"\n", PFID(ll_inode2fid(inode)),
3031 flock.l_flock.pid, flags, einfo.ei_mode,
3032 flock.l_flock.start, flock.l_flock.end);
3034 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data, &lockh,
3037 /* Restore the file lock type if not TEST lock. */
3038 if (!(flags & LDLM_FL_TEST_LOCK))
3039 file_lock->fl_type = fl_type;
/* Mirror a successful (or unlock) result into the local VFS lock
 * lists so userspace sees consistent state. */
3041 if ((file_lock->fl_flags & FL_FLOCK) &&
3042 (rc == 0 || file_lock->fl_type == F_UNLCK))
3043 rc2 = flock_lock_file_wait(file, file_lock);
3044 if ((file_lock->fl_flags & FL_POSIX) &&
3045 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3046 !(flags & LDLM_FL_TEST_LOCK))
3047 rc2 = posix_lock_file_wait(file, file_lock);
/* Local bookkeeping failed after the server granted the lock: undo
 * the server-side lock with an LCK_NL (unlock) enqueue. */
3049 if (rc2 && file_lock->fl_type != F_UNLCK) {
3050 einfo.ei_mode = LCK_NL;
3051 md_enqueue(sbi->ll_md_exp, &einfo, &flock, NULL, op_data,
3056 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name (length @namelen) under directory @parent by
 * sending a getattr-by-name RPC to the MDT.  On success *fid is filled in.
 * Returns 0 or a negative errno.
 */
3061 int ll_get_fid_by_name(struct inode *parent, const char *name,
3062 int namelen, struct lu_fid *fid)
3064 struct md_op_data *op_data = NULL;
3065 struct mdt_body *body;
3066 struct ptlrpc_request *req;
3070 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3071 LUSTRE_OPC_ANY, NULL);
3072 if (IS_ERR(op_data))
3073 RETURN(PTR_ERR(op_data));
/* Only the FID is needed from the reply. */
3075 op_data->op_valid = OBD_MD_FLID;
3076 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3077 ll_finish_md_op_data(op_data);
3081 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3083 GOTO(out_req, rc = -EFAULT);
3085 *fid = body->mbo_fid1;
3087 ptlrpc_req_finished(req);
/*
 * Migrate directory entry @name under @parent to MDT @mdtidx (DNE).
 * Resolves the child FID (from the dcache if possible, otherwise via
 * ll_get_fid_by_name()), skips the migration if the object already lives
 * on the target MDT, then issues a CLI_MIGRATE rename RPC.
 * NOTE(review): excerpted listing — some error paths and the iput of
 * child_inode are not visible here.
 */
3091 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3092 const char *name, int namelen)
3094 struct dentry *dchild = NULL;
3095 struct inode *child_inode = NULL;
3096 struct md_op_data *op_data;
3097 struct ptlrpc_request *request = NULL;
3102 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3103 name, PFID(ll_inode2fid(parent)), mdtidx);
3105 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3106 0, LUSTRE_OPC_ANY, NULL);
3107 if (IS_ERR(op_data))
3108 RETURN(PTR_ERR(op_data));
3110 /* Get child FID first */
3111 qstr.hash = full_name_hash(name, namelen);
/* Prefer the cached dentry; grab a reference on the child inode and
 * invalidate its aliases before moving it. */
3114 dchild = d_lookup(file->f_dentry, &qstr);
3115 if (dchild != NULL && dchild->d_inode != NULL) {
3116 op_data->op_fid3 = *ll_inode2fid(dchild->d_inode);
3117 if (dchild->d_inode != NULL) {
3118 child_inode = igrab(dchild->d_inode);
3119 ll_invalidate_aliases(child_inode);
/* Not in the dcache: ask the MDT for the FID. */
3123 rc = ll_get_fid_by_name(parent, name, namelen,
3129 if (!fid_is_sane(&op_data->op_fid3)) {
3130 CERROR("%s: migrate %s , but fid "DFID" is insane\n",
3131 ll_get_fsname(parent->i_sb, NULL, 0), name,
3132 PFID(&op_data->op_fid3));
3133 GOTO(out_free, rc = -EINVAL);
/* Already on the target MDT: nothing to do. */
3136 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3141 CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
3142 PFID(&op_data->op_fid3), mdtidx);
3143 GOTO(out_free, rc = 0);
/* Migration is implemented as a same-name rename with CLI_MIGRATE. */
3146 op_data->op_mds = mdtidx;
3147 op_data->op_cli_flags = CLI_MIGRATE;
3148 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3149 namelen, name, namelen, &request);
3151 ll_update_times(request, parent);
3153 ptlrpc_req_finished(request);
/* Drop the stale local inode: the object now lives under a new FID. */
3158 if (child_inode != NULL) {
3159 clear_nlink(child_inode);
3163 ll_finish_md_op_data(op_data);
/* ->lock/->flock stub for -o noflock mounts; body not visible in this
 * excerpt (per the table comment below it returns ENOSYS). */
3168 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3176 * test if some locks matching bits and l_req_mode are acquired
3177 * - bits can be in different locks
3178 * - if found clear the common lock bits in *bits
3179 * - the bits not found, are kept in *bits
3181 * \param bits [IN] searched lock bits [IN]
3182 * \param l_req_mode [IN] searched lock mode
3183 * \retval boolean, true iff all bits are found
/*
 * Test which of the inodebits in *bits are already covered by granted MDT
 * locks of mode @l_req_mode (LCK_MINMODE matches any of CR/CW/PR/PW).
 * Matching is non-intrusive (LDLM_FL_TEST_LOCK).  Found bits are cleared
 * from *bits; see the \param block above for the full contract.
 */
3185 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
3187 struct lustre_handle lockh;
3188 ldlm_policy_data_t policy;
3189 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3190 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3199 fid = &ll_i2info(inode)->lli_fid;
3200 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
3201 ldlm_lockname[mode]);
3203 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit at a time; stop early once all bits are found. */
3204 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
3205 policy.l_inodebits.bits = *bits & (1 << i);
3206 if (policy.l_inodebits.bits == 0)
3209 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
3210 &policy, mode, &lockh)) {
3211 struct ldlm_lock *lock;
/* A matched lock may cover more bits than the one probed;
 * clear every bit the lock actually grants. */
3213 lock = ldlm_handle2lock(&lockh);
3216 ~(lock->l_policy_data.l_inodebits.bits);
3217 LDLM_LOCK_PUT(lock);
3219 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a granted MDT inodebits lock
 * covering @bits on @inode.  On success the handle is stored in *lockh and
 * the matched mode is returned; 0 (LCK_MINMODE) means no match.
 */
3226 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
3227 struct lustre_handle *lockh, __u64 flags,
3230 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3235 fid = &ll_i2info(inode)->lli_fid;
3236 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
3238 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
3239 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidate RPC: -ENOENT on a non-regular,
 * non-directory inode is treated as a benign race with unlink; any other
 * error is logged (rate-limited for expected -EACCES/-EIDRM).
 */
3244 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
3246 /* Already unlinked. Just update nlink and return success */
3247 if (rc == -ENOENT) {
3249 /* This path cannot be hit for regular files unless in
3250 * case of obscure races, so no need to to validate
3252 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
3254 } else if (rc != 0) {
3255 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
3256 "%s: revalidate FID "DFID" error: rc = %d\n",
3257 ll_get_fsname(inode->i_sb, NULL, 0),
3258 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh the attributes of dentry->d_inode from the MDT if the needed
 * @ibits locks are not already held.  Two strategies: an intent getattr
 * (when the server supports OBD_CONNECT_ATTRFID) which also revalidates
 * the dentry, or a plain md_getattr() guarded by ll_have_md_lock().
 */
3264 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3266 struct inode *inode = dentry->d_inode;
3267 struct ptlrpc_request *req = NULL;
3268 struct obd_export *exp;
3272 LASSERT(inode != NULL);
3274 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
3275 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
3277 exp = ll_i2mdexp(inode);
3279 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
3280 * But under CMD case, it caused some lock issues, should be fixed
3281 * with new CMD ibits lock. See bug 12718 */
3282 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
3283 struct lookup_intent oit = { .it_op = IT_GETATTR };
3284 struct md_op_data *op_data;
/* Only the LOOKUP bit requested: a lighter IT_LOOKUP intent
 * is sufficient. */
3286 if (ibits == MDS_INODELOCK_LOOKUP)
3287 oit.it_op = IT_LOOKUP;
3289 /* Call getattr by fid, so do not provide name at all. */
3290 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
3291 dentry->d_inode, NULL, 0, 0,
3292 LUSTRE_OPC_ANY, NULL);
3293 if (IS_ERR(op_data))
3294 RETURN(PTR_ERR(op_data));
3296 rc = md_intent_lock(exp, op_data, &oit, &req,
3297 &ll_md_blocking_ast, 0);
3298 ll_finish_md_op_data(op_data);
3300 rc = ll_inode_revalidate_fini(inode, rc);
3304 rc = ll_revalidate_it_finish(req, &oit, dentry);
3306 ll_intent_release(&oit);
3310 /* Unlinked? Unhash dentry, so it is not picked up later by
3311 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3312 here to preserve get_cwd functionality on 2.6.
3314 if (!dentry->d_inode->i_nlink)
3315 d_lustre_invalidate(dentry, 0);
3317 ll_lookup_finish_locks(&oit, dentry);
/* No ATTRFID support: only fetch attributes if no suitable MDT
 * lock already guarantees they are current. */
3318 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
3319 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3320 obd_valid valid = OBD_MD_FLGETATTR;
3321 struct md_op_data *op_data;
/* For regular files also ask for the striping EA, sized to the
 * default MD buffer. */
3324 if (S_ISREG(inode->i_mode)) {
3325 rc = ll_get_default_mdsize(sbi, &ealen);
3328 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3331 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
3332 0, ealen, LUSTRE_OPC_ANY,
3334 if (IS_ERR(op_data))
3335 RETURN(PTR_ERR(op_data));
3337 op_data->op_valid = valid;
3338 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3339 * capa for this inode. Because we only keep capas of dirs
3341 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
3342 ll_finish_md_op_data(op_data);
3344 rc = ll_inode_revalidate_fini(inode, rc);
3348 rc = ll_prep_inode(&inode, req, NULL, NULL);
3351 ptlrpc_req_finished(req);
/*
 * For a striped (DNE) directory, merge the per-stripe attributes from all
 * MDTs into one cl_attr and cache size/nlink/times in the ll_inode_info.
 */
3355 static int ll_merge_md_attr(struct inode *inode)
3357 struct cl_attr attr = { 0 };
3360 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
3361 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
/* Cache the merged values; ll_getattr()/ll_inode_revalidate()
 * read them back from lli. */
3366 ll_i2info(inode)->lli_stripe_dir_size = attr.cat_size;
3367 ll_i2info(inode)->lli_stripe_dir_nlink = attr.cat_nlink;
3369 ll_i2info(inode)->lli_lvb.lvb_atime = attr.cat_atime;
3370 ll_i2info(inode)->lli_lvb.lvb_mtime = attr.cat_mtime;
3371 ll_i2info(inode)->lli_lvb.lvb_ctime = attr.cat_ctime;
/*
 * Full revalidate: refresh metadata via __ll_inode_revalidate(), then make
 * the size current — merged stripe attrs for striped directories, cached
 * LVB times for other non-regular inodes, and an OST glimpse for regular
 * files (unless an HSM restore is in progress, see comment below).
 */
3377 ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
3379 struct inode *inode = dentry->d_inode;
3383 rc = __ll_inode_revalidate(dentry, ibits);
3387 /* if object isn't regular file, don't validate size */
3388 if (!S_ISREG(inode->i_mode)) {
3389 if (S_ISDIR(inode->i_mode) &&
3390 ll_i2info(inode)->lli_lsm_md != NULL) {
3391 rc = ll_merge_md_attr(inode);
3396 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3397 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3398 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3400 /* In case of restore, the MDT has the right size and has
3401 * already send it back without granting the layout lock,
3402 * inode is up-to-date so glimpse is useless.
3403 * Also to glimpse we need the layout, in case of a running
3404 * restore the MDT holds the layout lock so the glimpse will
3405 * block up to the end of restore (getattr will block)
3407 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3408 rc = ll_glimpse_size(inode);
/*
 * ->getattr handler: revalidate UPDATE|LOOKUP bits, then fill *stat from
 * the (now current) inode.  Striped directories report the merged
 * cross-MDT size/nlink cached by ll_merge_md_attr().
 */
3413 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3415 struct inode *inode = de->d_inode;
3416 struct ll_sb_info *sbi = ll_i2sbi(inode);
3417 struct ll_inode_info *lli = ll_i2info(inode);
3420 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3421 MDS_INODELOCK_LOOKUP);
3422 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3427 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits; build it from
 * the FID instead of using the native i_ino. */
3428 if (ll_need_32bit_api(sbi))
3429 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1)
3431 stat->ino = inode->i_ino;
3432 stat->mode = inode->i_mode;
3433 stat->uid = inode->i_uid;
3434 stat->gid = inode->i_gid;
3435 stat->rdev = inode->i_rdev;
3436 stat->atime = inode->i_atime;
3437 stat->mtime = inode->i_mtime;
3438 stat->ctime = inode->i_ctime;
3439 stat->blksize = 1 << inode->i_blkbits;
3440 stat->blocks = inode->i_blocks;
3442 if (S_ISDIR(inode->i_mode) &&
3443 ll_i2info(inode)->lli_lsm_md != NULL) {
3444 stat->nlink = lli->lli_stripe_dir_nlink;
3445 stat->size = lli->lli_stripe_dir_size;
3447 stat->nlink = inode->i_nlink;
3448 stat->size = i_size_read(inode);
/*
 * ->fiemap handler: marshal the VFS fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer, run ll_do_fiemap(), and copy the mapped extents
 * back out.  The buffer is sized for fi_extents_max extents.
 */
3454 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3455 __u64 start, __u64 len)
3459 struct ll_user_fiemap *fiemap;
3460 unsigned int extent_count = fieinfo->fi_extents_max;
3462 num_bytes = sizeof(*fiemap) + (extent_count *
3463 sizeof(struct ll_fiemap_extent));
3464 OBD_ALLOC_LARGE(fiemap, num_bytes);
3469 fiemap->fm_flags = fieinfo->fi_flags;
3470 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3471 fiemap->fm_start = start;
3472 fiemap->fm_length = len;
/* Seed the first extent from the caller's buffer (continuation
 * support); only one extent is copied in here. */
3473 if (extent_count > 0)
3474 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3475 sizeof(struct ll_fiemap_extent));
3477 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results (flags, extent count, extent array) back to the VFS
 * structure. */
3479 fieinfo->fi_flags = fiemap->fm_flags;
3480 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3481 if (extent_count > 0)
3482 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3483 fiemap->fm_mapped_extents *
3484 sizeof(struct ll_fiemap_extent));
3486 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: return a referenced copy of the cached POSIX ACL.
 * The copy is taken under lli_lock; the VFS releases the reference.
 */
3490 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3492 struct ll_inode_info *lli = ll_i2info(inode);
3493 struct posix_acl *acl = NULL;
3496 spin_lock(&lli->lli_lock);
3497 /* VFS' acl_permission_check->check_acl will release the refcount */
3498 acl = posix_acl_dup(lli->lli_posix_acl);
3499 spin_unlock(&lli->lli_lock);
3504 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
/*
 * ACL callback for generic_permission() on kernels whose generic helper
 * does not look up ACLs itself.  Prototype varies with kernel version;
 * without CONFIG_FS_POSIX_ACL the function degenerates (body in the
 * elided #else branch, not visible in this excerpt).
 */
3506 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3507 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3509 ll_check_acl(struct inode *inode, int mask)
3512 # ifdef CONFIG_FS_POSIX_ACL
3513 struct posix_acl *acl;
/* RCU walk cannot block on ACL retrieval; bail out (elided return). */
3517 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3518 if (flags & IPERM_FLAG_RCU)
3521 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3526 rc = posix_acl_permission(inode, acl, mask);
3527 posix_acl_release(acl);
3530 # else /* !CONFIG_FS_POSIX_ACL */
3532 # endif /* CONFIG_FS_POSIX_ACL */
3534 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission handler (prototype varies with kernel version).  Extra
 * Lustre work on top of generic_permission(): revalidate the root inode,
 * apply root-squash by temporarily overriding credentials, and route to
 * remote permission checking for RMT_CLIENT mounts.
 */
3536 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3537 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3539 # ifdef HAVE_INODE_PERMISION_2ARGS
3540 int ll_inode_permission(struct inode *inode, int mask)
3542 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3547 struct ll_sb_info *sbi;
3548 struct root_squash_info *squash;
3549 struct cred *cred = NULL;
3550 const struct cred *old_cred = NULL;
3552 bool squash_id = false;
/* RCU-walk mode must not block; defer to ref-walk (elided return). */
3555 #ifdef MAY_NOT_BLOCK
3556 if (mask & MAY_NOT_BLOCK)
3558 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3559 if (flags & IPERM_FLAG_RCU)
3563 /* as root inode are NOT getting validated in lookup operation,
3564 * need to do it before permission check. */
3566 if (inode == inode->i_sb->s_root->d_inode) {
3567 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3568 MDS_INODELOCK_LOOKUP);
3573 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
3574 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
3576 /* squash fsuid/fsgid if needed */
3577 sbi = ll_i2sbi(inode);
3578 squash = &sbi->ll_squash;
/* Root squash applies when configured (rsi_uid != 0), the caller is
 * root, and the mount does not opt out via norootsquash. */
3579 if (unlikely(squash->rsi_uid != 0 &&
3580 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
3581 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
3585 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
3586 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
3587 squash->rsi_uid, squash->rsi_gid);
3589 /* update current process's credentials
3590 * and FS capability */
3591 cred = prepare_creds();
3595 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
3596 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop every filesystem-related capability bit from the squashed
 * credentials so root cannot bypass DAC checks. */
3597 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
3598 if ((1 << cap) & CFS_CAP_FS_MASK)
3599 cap_lower(cred->cap_effective, cap);
3601 old_cred = override_creds(cred);
3604 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
3606 if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
3607 rc = lustre_check_remote_perm(inode, mask);
3609 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3611 /* restore current process's credentials and FS capability */
3613 revert_creds(old_cred);
3620 /* -o localflock - only provides locally consistent flock locks */
3621 struct file_operations ll_file_operations = {
3622 .read = ll_file_read,
3623 .aio_read = ll_file_aio_read,
3624 .write = ll_file_write,
3625 .aio_write = ll_file_aio_write,
3626 .unlocked_ioctl = ll_file_ioctl,
3627 .open = ll_file_open,
3628 .release = ll_file_release,
3629 .mmap = ll_file_mmap,
3630 .llseek = ll_file_seek,
3631 .splice_read = ll_file_splice_read,
/* Variant for -o flock mounts: same table plus cluster-coherent
 * ->flock/->lock handlers backed by LDLM flock locks. */
3636 struct file_operations ll_file_operations_flock = {
3637 .read = ll_file_read,
3638 .aio_read = ll_file_aio_read,
3639 .write = ll_file_write,
3640 .aio_write = ll_file_aio_write,
3641 .unlocked_ioctl = ll_file_ioctl,
3642 .open = ll_file_open,
3643 .release = ll_file_release,
3644 .mmap = ll_file_mmap,
3645 .llseek = ll_file_seek,
3646 .splice_read = ll_file_splice_read,
3649 .flock = ll_file_flock,
3650 .lock = ll_file_flock
3653 /* These are for -o noflock - to return ENOSYS on flock calls */
3654 struct file_operations ll_file_operations_noflock = {
3655 .read = ll_file_read,
3656 .aio_read = ll_file_aio_read,
3657 .write = ll_file_write,
3658 .aio_write = ll_file_aio_write,
3659 .unlocked_ioctl = ll_file_ioctl,
3660 .open = ll_file_open,
3661 .release = ll_file_release,
3662 .mmap = ll_file_mmap,
3663 .llseek = ll_file_seek,
3664 .splice_read = ll_file_splice_read,
3667 .flock = ll_file_noflock,
3668 .lock = ll_file_noflock
/* Inode operations shared by all regular files on this client. */
3671 struct inode_operations ll_file_inode_operations = {
3672 .setattr = ll_setattr,
3673 .getattr = ll_getattr,
3674 .permission = ll_inode_permission,
3675 .setxattr = ll_setxattr,
3676 .getxattr = ll_getxattr,
3677 .listxattr = ll_listxattr,
3678 .removexattr = ll_removexattr,
3679 .fiemap = ll_fiemap,
3680 #ifdef HAVE_IOP_GET_ACL
3681 .get_acl = ll_get_acl,
3685 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, protected by a
 * reader/writer semaphore (readers: dispatch; writers: (un)register). */
3686 static struct llioc_ctl_data {
3687 struct rw_semaphore ioc_sem;
3688 struct list_head ioc_head;
3690 __RWSEM_INITIALIZER(llioc.ioc_sem),
3691 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: its callback plus the flexible array of ioctl
 * command numbers it claims. iocd_size is the full allocation size. */
3696 struct list_head iocd_list;
3697 unsigned int iocd_size;
3698 llioc_callback_t iocd_cb;
3699 unsigned int iocd_count;
3700 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler @cb for @count command numbers in @cmd.
 * Returns an opaque cookie (the allocation itself) to pass to
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation failure
 * (the elided lines return NULL on those paths).
 */
3703 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3706 struct llioc_data *in_data = NULL;
3709 if (cb == NULL || cmd == NULL ||
3710 count > LLIOC_MAX_CMD || count < 0)
/* One allocation holds the header and the trailing cmd array. */
3713 size = sizeof(*in_data) + count * sizeof(unsigned int);
3714 OBD_ALLOC(in_data, size);
3715 if (in_data == NULL)
3718 memset(in_data, 0, sizeof(*in_data));
3719 in_data->iocd_size = size;
3720 in_data->iocd_cb = cb;
3721 in_data->iocd_count = count;
3722 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish under the write lock. */
3724 down_write(&llioc.ioc_sem);
3725 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3726 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously returned by
 * ll_iocontrol_register().  @magic is the cookie; an unknown cookie is
 * only warned about.
 */
3731 void ll_iocontrol_unregister(void *magic)
3733 struct llioc_data *tmp;
3738 down_write(&llioc.ioc_sem);
3739 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Matching entry (comparison elided): unlink, drop the lock, free. */
3741 unsigned int size = tmp->iocd_size;
3743 list_del(&tmp->iocd_list);
3744 up_write(&llioc.ioc_sem);
3746 OBD_FREE(tmp, size);
3750 up_write(&llioc.ioc_sem);
3752 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3755 EXPORT_SYMBOL(ll_iocontrol_register);
3756 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to registered dynamic handlers.  Walks
 * the registry under the read lock; the first handler claiming @cmd runs
 * and may stop further iteration by returning LLIOC_STOP.  *rcp carries
 * the handler's result (default -EINVAL).
 */
3758 static enum llioc_iter
3759 ll_iocontrol_call(struct inode *inode, struct file *file,
3760 unsigned int cmd, unsigned long arg, int *rcp)
3762 enum llioc_iter ret = LLIOC_CONT;
3763 struct llioc_data *data;
3764 int rc = -EINVAL, i;
3766 down_read(&llioc.ioc_sem);
3767 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3768 for (i = 0; i < data->iocd_count; i++) {
3769 if (cmd != data->iocd_cmd[i])
3772 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3776 if (ret == LLIOC_STOP)
3779 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl object stack for @inode.  For
 * OBJECT_CONF_SET the new stripe md is applied, the layout lock is made
 * matchable, and the cached layout generation is updated.
 */
3786 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3788 struct ll_inode_info *lli = ll_i2info(inode);
3789 struct cl_env_nest nest;
/* No cl object attached — nothing to configure (elided return). */
3794 if (lli->lli_clob == NULL)
3797 env = cl_env_nested_get(&nest);
3799 RETURN(PTR_ERR(env));
3801 result = cl_conf_set(env, lli->lli_clob, conf);
3802 cl_env_nested_put(&nest, env);
3804 if (conf->coc_opc == OBJECT_CONF_SET) {
3805 struct ldlm_lock *lock = conf->coc_lock;
3807 LASSERT(lock != NULL);
3808 LASSERT(ldlm_has_layout(lock));
3810 struct lustre_md *md = conf->u.coc_md;
3811 __u32 gen = LL_LAYOUT_GEN_EMPTY;
3813 /* it can only be allowed to match after layout is
3814 * applied to inode otherwise false layout would be
3815 * seen. Applying layout should happen before dropping
3816 * the intent lock. */
3817 ldlm_lock_allow_match(lock);
3819 lli->lli_has_smd = lsm_has_objects(md->lsm);
3820 if (md->lsm != NULL)
3821 gen = md->lsm->lsm_layout_gen;
3824 DFID ": layout version change: %u -> %u\n",
3825 PFID(&lli->lli_fid), ll_layout_version_get(lli),
3827 ll_layout_version_set(lli, gen);
3833 /* Fetch layout from MDT with getxattr request, if it's not ready yet.
 * Used when a layout lock was granted via completion AST so the layout
 * was not delivered in the reply LVB; the fetched LOV EA is installed
 * as the lock's l_lvb_data. Returns 0 or a negative errno. */
3834 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3837 struct ll_sb_info *sbi = ll_i2sbi(inode);
3838 struct obd_capa *oc;
3839 struct ptlrpc_request *req;
3840 struct mdt_body *body;
3847 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3848 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
3849 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already present and ready — nothing to fetch. */
3851 if ((lock->l_lvb_data != NULL) && ldlm_is_lvb_ready(lock))
3854 /* if layout lock was granted right away, the layout is returned
3855 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3856 * blocked and then granted via completion ast, we have to fetch
3857 * layout here. Please note that we can't use the LVB buffer in
3858 * completion AST because it doesn't have a large enough buffer */
3859 oc = ll_mdscapa_get(inode);
3860 rc = ll_get_default_mdsize(sbi, &lmmsize);
3862 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3863 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3869 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3871 GOTO(out, rc = -EPROTO);
3873 lmmsize = body->mbo_eadatasize;
3874 if (lmmsize == 0) /* empty layout */
3877 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3879 GOTO(out, rc = -EFAULT);
/* Copy the EA out of the reply buffer; the copy becomes the lock's
 * LVB (freed later by the ldlm code, replacing any old buffer). */
3881 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3882 if (lvbdata == NULL)
3883 GOTO(out, rc = -ENOMEM);
3885 memcpy(lvbdata, lmm, lmmsize);
3886 lock_res_and_lock(lock);
3887 if (lock->l_lvb_data != NULL)
3888 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3890 lock->l_lvb_data = lvbdata;
3891 lock->l_lvb_len = lmmsize;
3892 unlock_res_and_lock(lock);
3897 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by a granted layout lock to the inode and
 * release the lock reference (per the header comment above).  Fetches the
 * layout if the LVB is not ready, unpacks it, configures the cl object,
 * and reports the resulting layout generation through *gen.  On -EBUSY
 * (layout still in use) it waits for in-flight IO via OBJECT_CONF_WAIT.
 */
3905 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3906 struct inode *inode, __u32 *gen, bool reconf)
3908 struct ll_inode_info *lli = ll_i2info(inode);
3909 struct ll_sb_info *sbi = ll_i2sbi(inode);
3910 struct ldlm_lock *lock;
3911 struct lustre_md md = { NULL };
3912 struct cl_object_conf conf;
3915 bool wait_layout = false;
3918 LASSERT(lustre_handle_is_used(lockh));
3920 lock = ldlm_handle2lock(lockh);
3921 LASSERT(lock != NULL);
3922 LASSERT(ldlm_has_layout(lock));
3924 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured: %d",
3925 PFID(&lli->lli_fid), inode, reconf);
3927 /* in case this is a caching lock and reinstate with new inode */
3928 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3930 lock_res_and_lock(lock);
3931 lvb_ready = ldlm_is_lvb_ready(lock);
3932 unlock_res_and_lock(lock);
3933 /* checking lvb_ready is racy but this is okay. The worst case is
3934 * that multi processes may configure the file on the same time. */
/* Layout already applied (or caller did not ask for reconfiguration):
 * just report the current generation and release the lock. */
3936 if (lvb_ready || !reconf) {
3939 /* layout_gen must be valid if layout lock is not
3940 * cancelled and stripe has already set */
3941 *gen = ll_layout_version_get(lli);
3947 rc = ll_layout_fetch(inode, lock);
3951 /* for layout lock, lmm is returned in lock's lvb.
3952 * lvb_data is immutable if the lock is held so it's safe to access it
3953 * without res lock. See the description in ldlm_lock_decref_internal()
3954 * for the condition to free lvb_data of layout lock */
3955 if (lock->l_lvb_data != NULL) {
3956 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3957 lock->l_lvb_data, lock->l_lvb_len);
3959 *gen = LL_LAYOUT_GEN_EMPTY;
3961 *gen = md.lsm->lsm_layout_gen;
3964 CERROR("%s: file "DFID" unpackmd error: %d\n",
3965 ll_get_fsname(inode->i_sb, NULL, 0),
3966 PFID(&lli->lli_fid), rc);
3972 /* set layout to file. Unlikely this will fail as old layout was
3973 * surely eliminated */
3974 memset(&conf, 0, sizeof conf);
3975 conf.coc_opc = OBJECT_CONF_SET;
3976 conf.coc_inode = inode;
3977 conf.coc_lock = lock;
3978 conf.u.coc_md = &md;
3979 rc = ll_layout_conf(inode, &conf);
3982 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3984 /* refresh layout failed, need to wait */
3985 wait_layout = rc == -EBUSY;
3989 LDLM_LOCK_PUT(lock);
3990 ldlm_lock_decref(lockh, mode);
3992 /* wait for IO to complete if it's still being used. */
3994 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
3995 ll_get_fsname(inode->i_sb, NULL, 0),
3996 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until outstanding IO under the old layout
 * drains, then the caller can retry the configuration. */
3998 memset(&conf, 0, sizeof conf);
3999 conf.coc_opc = OBJECT_CONF_WAIT;
4000 conf.coc_inode = inode;
4001 rc = ll_layout_conf(inode, &conf);
4005 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4006 ll_get_fsname(inode->i_sb, NULL, 0),
4007 PFID(&lli->lli_fid), rc);
4013 * This function checks if there exists a LAYOUT lock on the client side,
4014 * or enqueues it if it doesn't have one in cache.
4016 * This function will not hold the layout lock, so it may be revoked any time
4017 * after this function returns. Any operations that depend on the layout should be redone
4020 * This function should be called before lov_io_init() to get an uptodate
4021 * layout version, the caller should save the version number and after IO
4022 * is finished, this function should be called again to verify that layout
4023 * is not changed during IO time.
/**
 * Ensure this client holds a usable layout for \a inode and report its
 * generation.
 *
 * Returns immediately when layout locks are disabled (LL_SBI_LAYOUT_LOCK
 * unset in the superblock flags) or a valid generation is already cached
 * (*gen != LL_LAYOUT_GEN_NONE).  Otherwise, under lli_layout_mutex, first
 * tries to match a cached MDS_INODELOCK_LAYOUT DLM lock; failing that,
 * enqueues an IT_LAYOUT intent to the MDT.  Either way the resulting lock is
 * applied with ll_layout_lock_set(), which fills in \a gen.
 *
 * \param inode  regular file (S_ISREG asserted) with a sane FID
 * \param gen    [out] current layout generation
 *
 * \retval 0 on success, negative errno on failure
 */
4025 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4027 struct ll_inode_info *lli = ll_i2info(inode);
4028 struct ll_sb_info *sbi = ll_i2sbi(inode);
4029 struct md_op_data *op_data;
4030 struct lookup_intent it;
4031 struct lustre_handle lockh;
4033 struct ldlm_enqueue_info einfo = {
4034 .ei_type = LDLM_IBITS,
4036 .ei_cb_bl = &ll_md_blocking_ast,
4037 .ei_cb_cp = &ldlm_completion_ast,
4042 *gen = ll_layout_version_get(lli);
4043 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
4047 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4048 LASSERT(S_ISREG(inode->i_mode));
4050 /* take layout lock mutex to enqueue layout lock exclusively. */
4051 mutex_lock(&lli->lli_layout_mutex);
4054 /* mostly layout lock is caching on the local side, so try to match
4055 * it before grabbing layout lock mutex. */
/* NOTE(review): in this view the mutex is taken just above, so the "before
 * grabbing" wording appears stale — confirm against the full file. */
4056 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4057 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4058 if (mode != 0) { /* hit cached lock */
4059 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4063 mutex_unlock(&lli->lli_layout_mutex);
4067 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4068 0, 0, LUSTRE_OPC_ANY, NULL);
4069 if (IS_ERR(op_data)) {
/* drop the mutex before bailing out with the prep error */
4070 mutex_unlock(&lli->lli_layout_mutex);
4071 RETURN(PTR_ERR(op_data));
4074 /* have to enqueue one */
4075 memset(&it, 0, sizeof(it));
4076 it.it_op = IT_LAYOUT;
4077 lockh.cookie = 0ULL;
4079 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4080 ll_get_fsname(inode->i_sb, NULL, 0),
4081 PFID(&lli->lli_fid), inode);
4083 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, &it, op_data, &lockh, 0);
/* the enqueue reply request is no longer needed once the intent is done */
4084 if (it.d.lustre.it_data != NULL)
4085 ptlrpc_req_finished(it.d.lustre.it_data);
4086 it.d.lustre.it_data = NULL;
4088 ll_finish_md_op_data(op_data);
/* take over the lock reference from the intent so ll_layout_lock_set()
 * owns the decref; clear it before dropping the intent */
4090 mode = it.d.lustre.it_lock_mode;
4091 it.d.lustre.it_lock_mode = 0;
4092 ll_intent_drop_lock(&it);
4095 /* set lock data in case this is a new lock */
4096 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4097 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
4101 mutex_unlock(&lli->lli_layout_mutex);
4107 * This function sends a restore request to the MDT
4109 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4111 struct hsm_user_request *hur;
4115 len = sizeof(struct hsm_user_request) +
4116 sizeof(struct hsm_user_item);
4117 OBD_ALLOC(hur, len);
4121 hur->hur_request.hr_action = HUA_RESTORE;
4122 hur->hur_request.hr_archive_id = 0;
4123 hur->hur_request.hr_flags = 0;
4124 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4125 sizeof(hur->hur_user_item[0].hui_fid));
4126 hur->hur_user_item[0].hui_extent.offset = offset;
4127 hur->hur_user_item[0].hui_extent.length = length;
4128 hur->hur_request.hr_itemcount = 1;
4129 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,