1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the ll_file_data_slab cache.
 * Returns the new fd (state of initialization beyond allocation not visible
 * here); caller releases it with ll_file_data_put(). */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return an ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. */
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current state into @op_data for an MDS request:
 * fid, mode/times/size/blocks/flags, the current I/O epoch, the open
 * handle @fh, and the MDS capability for this inode. */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* NOTE(review): presumably takes a capability reference that the caller
 * (or request completion) must drop — confirm against ll_mdscapa_get(). */
61 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close of open handle @och: choose which
 * attributes are valid for the close RPC and pack the inode state. */
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
72 if (!(och->och_flags & FMODE_WRITE))
/* Send size/blocks on close only when the server cannot maintain
 * Size-on-MDS itself, or for non-regular files. */
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close for open handle @och and tear down its replay state.
 * If the server requests a Size-on-MDS update (-ESTALE-style epoch flow),
 * gather size from the OSTs and send a setattr back to the MDS. */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
/* Destroy the OST objects described in the close reply, if any. */
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM-enabled regular file closed for write without the epoch closed:
 * queue a DONE_WRITING so the epoch gets closed later. */
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
/* Drop the last local user of the per-inode MDS open handle that matches
 * @flags (write/exec/read) and, if no users remain, close it on the MDS.
 * lli_och_sem serializes handle selection against concurrent opens. */
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
/* Pick the open handle slot and use count matching the open mode. */
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open count, and close the MDS open handle unless a cached OPEN DLM lock
 * lets us skip talking to the MDS. Frees the ll_file_data. */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
/* Drop our reference on the per-mode open handle under lli_och_sem. */
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
/* No matching cached OPEN ibits lock: must do the real MDS close. */
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, the caller's fput() does not check it,
273 * so we need to make every effort to clean up all of our state here. Also,
274 * applications rarely check close errors, and even if an error is returned
275 * they will not re-try the close call.
/* ->release() for Lustre files: tear down remote-ACL state for the root
 * inode, stop statahead if this fd owns it, clear async write errors on
 * the stripe MD, and perform the MDS close via ll_md_close(). */
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
289 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only kept on the filesystem root. */
290 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
291 inode == inode->i_sb->s_root->d_inode) {
292 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
295 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
296 fd->fd_flags &= ~LL_FILE_RMTACL;
297 rct_del(&sbi->ll_rct, cfs_curproc_pid());
298 et_search_free(&sbi->ll_et, cfs_curproc_pid());
303 if (inode->i_sb->s_root != file->f_dentry)
304 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
305 fd = LUSTRE_FPRIVATE(file);
308 /* The last ref on @file, maybe not the owner pid of statahead.
309 * Different processes can open the same dir, "ll_opendir_key" means:
310 * it is me that should stop the statahead thread. */
311 if (lli->lli_opendir_key == fd)
312 ll_stop_statahead(inode, fd);
/* The root dentry never had an MDS open handle; just free the fd. */
314 if (inode->i_sb->s_root == file->f_dentry) {
315 LUSTRE_FPRIVATE(file) = NULL;
316 ll_file_data_put(fd);
321 lov_test_and_clear_async_rc(lsm);
322 lli->lli_async_rc = 0;
324 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Issue an IT_OPEN intent lock request to the MDS for @file, optionally
 * carrying striping info (@lmm/@lmmsize) when setting stripe parameters.
 * On success, attaches the returned lock data and refreshes the inode. */
328 static int ll_intent_file_open(struct file *file, void *lmm,
329 int lmmsize, struct lookup_intent *itp)
331 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
332 struct dentry *parent = file->f_dentry->d_parent;
333 const char *name = file->f_dentry->d_name.name;
334 const int len = file->f_dentry->d_name.len;
335 struct md_op_data *op_data;
336 struct ptlrpc_request *req;
343 /* Usually we come here only for NFSD, and we want open lock.
344 But we can also get here with pre 2.6.15 patchless kernels, and in
345 that case that lock is also ok */
346 /* We can also get here if there was cached open handle in revalidate_it
347 * but it disappeared while we were getting from there to ll_file_open.
348 * But this means this file was closed and immediately opened which
349 * makes a good candidate for using OPEN lock */
350 /* If lmmsize & lmm are not 0, we are just setting stripe info
351 * parameters. No need for the open lock */
352 if (!lmm && !lmmsize)
353 itp->it_flags |= MDS_OPEN_LOCK;
355 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
356 file->f_dentry->d_inode, name, len,
357 O_RDWR, LUSTRE_OPC_ANY, NULL);
359 RETURN(PTR_ERR(op_data));
361 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
362 0 /*unused */, &req, ll_md_blocking_ast, 0);
363 ll_finish_md_op_data(op_data);
365 /* reason to keep our own exit path - don't flood log
366 * with messages with -ESTALE errors.
368 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
369 it_open_error(DISP_OPEN_OPEN, itp))
371 ll_release_openhandle(file->f_dentry, itp);
375 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
376 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
377 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Attach the granted lock to the inode so later matches find it. */
381 if (itp->d.lustre.it_lock_mode)
382 md_set_lock_data(sbi->ll_md_exp,
383 &itp->d.lustre.it_lock_handle,
384 file->f_dentry->d_inode);
386 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
388 ptlrpc_req_finished(itp->d.lustre.it_data);
391 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
392 ll_intent_drop_lock(itp);
/* Fill @och from the MDS open reply carried by intent @it: copy the open
 * handle, record the fid/flags, note the server-assigned I/O epoch, and
 * register the open for replay. Returns the replay-registration result. */
397 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
398 struct lookup_intent *it, struct obd_client_handle *och)
400 struct ptlrpc_request *req = it->d.lustre.it_data;
401 struct mdt_body *body;
405 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
406 LASSERT(body != NULL); /* reply already checked out */
408 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
409 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
410 och->och_fid = lli->lli_fid;
411 och->och_flags = it->it_flags;
412 lli->lli_ioepoch = body->ioepoch;
414 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the file and initialize readahead state.
 * The file must not already have private Lustre data attached. */
417 int ll_local_open(struct file *file, struct lookup_intent *it,
418 struct ll_file_data *fd, struct obd_client_handle *och)
420 struct inode *inode = file->f_dentry->d_inode;
421 struct ll_inode_info *lli = ll_i2info(inode);
424 LASSERT(!LUSTRE_FPRIVATE(file));
429 struct ptlrpc_request *req = it->d.lustre.it_data;
430 struct mdt_body *body;
433 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
437 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
438 if ((it->it_flags & FMODE_WRITE) &&
439 (body->valid & OBD_MD_FLSIZE))
440 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
441 lli->lli_ioepoch, PFID(&lli->lli_fid));
444 LUSTRE_FPRIVATE(file) = fd;
445 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the open mode for the matching decrement in ll_md_close(). */
446 fd->fd_omode = it->it_flags;
450 /* Open a file, and (for the very first open) create objects on the OSTs at
451 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
452 * creation or open until ll_lov_setstripe() ioctl is called. We grab
453 * lli_open_sem to ensure no other process will create objects, send the
454 * stripe MD to the MDS, or try to destroy the objects if that fails.
456 * If we already have the stripe MD locally then we don't request it in
457 * md_open(), by passing a lmm_size = 0.
459 * It is up to the application to ensure no other processes open this file
460 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
461 * used. We might be able to avoid races of that sort by getting lli_open_sem
462 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
463 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* ->open() for Lustre files. Allocates the per-fd ll_file_data, claims
 * statahead ownership for directories, reuses an existing per-mode MDS open
 * handle when one is cached, or performs a fresh intent open otherwise. */
465 int ll_file_open(struct inode *inode, struct file *file)
467 struct ll_inode_info *lli = ll_i2info(inode);
468 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
469 .it_flags = file->f_flags };
470 struct lov_stripe_md *lsm;
471 struct ptlrpc_request *req = NULL;
472 struct obd_client_handle **och_p;
474 struct ll_file_data *fd;
475 int rc = 0, opendir_set = 0;
478 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
479 inode->i_generation, inode, file->f_flags);
481 #ifdef HAVE_VFS_INTENT_PATCHES
484 it = file->private_data; /* XXX: compat macro */
485 file->private_data = NULL; /* prevent ll_local_open assertion */
488 fd = ll_file_data_get();
/* Directory open: try to take ownership of the statahead thread key. */
492 if (S_ISDIR(inode->i_mode)) {
493 spin_lock(&lli->lli_lock);
494 /* "lli->lli_opendir_pid != 0" means someone has set it.
495 * "lli->lli_sai != NULL" means the previous statahead has not
497 if (lli->lli_opendir_pid == 0 && lli->lli_sai == NULL) {
499 lli->lli_opendir_pid = cfs_curproc_pid();
500 lli->lli_opendir_key = fd;
501 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid())) {
502 /* Two cases for this:
503 * (1) The same process open such directory many times.
504 * (2) The old process opened the directory, and exited
505 * before its children processes. Then new process
506 * with the same pid opens such directory before the
507 * old process's children processes exit.
508 * Change the owner to the latest one. */
510 lli->lli_opendir_key = fd;
512 spin_unlock(&lli->lli_lock);
/* Root dentry: no MDS open needed, just attach the fd. */
515 if (inode->i_sb->s_root == file->f_dentry) {
516 LUSTRE_FPRIVATE(file) = fd;
520 if (!it || !it->d.lustre.it_disposition) {
521 /* Convert f_flags into access mode. We cannot use file->f_mode,
522 * because everything but O_ACCMODE mask was stripped from
524 if ((oit.it_flags + 1) & O_ACCMODE)
526 if (file->f_flags & O_TRUNC)
527 oit.it_flags |= FMODE_WRITE;
529 /* kernel only call f_op->open in dentry_open. filp_open calls
530 * dentry_open after call to open_namei that checks permissions.
531 * Only nfsd_open call dentry_open directly without checking
532 * permissions and because of that this code below is safe. */
533 if (oit.it_flags & FMODE_WRITE)
534 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
536 /* We do not want O_EXCL here, presumably we opened the file
537 * already? XXX - NFS implications? */
538 oit.it_flags &= ~O_EXCL;
544 /* Let's see if we have file open on MDS already. */
545 if (it->it_flags & FMODE_WRITE) {
546 och_p = &lli->lli_mds_write_och;
547 och_usecount = &lli->lli_open_fd_write_count;
548 } else if (it->it_flags & FMODE_EXEC) {
549 och_p = &lli->lli_mds_exec_och;
550 och_usecount = &lli->lli_open_fd_exec_count;
552 och_p = &lli->lli_mds_read_och;
553 och_usecount = &lli->lli_open_fd_read_count;
556 down(&lli->lli_och_sem);
557 if (*och_p) { /* Open handle is present */
558 if (it_disposition(it, DISP_OPEN_OPEN)) {
559 /* Well, there's extra open request that we do not need,
560 let's close it somehow. This will decref request. */
561 rc = it_open_error(DISP_OPEN_OPEN, it);
563 ll_file_data_put(fd);
564 GOTO(out_och_free, rc);
566 ll_release_openhandle(file->f_dentry, it);
567 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached open handle; no new och is filled in. */
572 rc = ll_local_open(file, it, fd, NULL);
574 up(&lli->lli_och_sem);
575 ll_file_data_put(fd);
579 LASSERT(*och_usecount == 0);
580 if (!it->d.lustre.it_disposition) {
581 /* We cannot just request lock handle now, new ELC code
582 means that one of other OPEN locks for this file
583 could be cancelled, and since blocking ast handler
584 would attempt to grab och_sem as well, that would
585 result in a deadlock */
586 up(&lli->lli_och_sem);
587 it->it_flags |= O_CHECK_STALE;
588 rc = ll_intent_file_open(file, NULL, 0, it);
589 it->it_flags &= ~O_CHECK_STALE;
591 ll_file_data_put(fd);
592 GOTO(out_openerr, rc);
595 /* Got some error? Release the request */
596 if (it->d.lustre.it_status < 0) {
597 req = it->d.lustre.it_data;
598 ptlrpc_req_finished(req);
600 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
601 &it->d.lustre.it_lock_handle,
602 file->f_dentry->d_inode);
605 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
607 ll_file_data_put(fd);
608 GOTO(out_och_free, rc = -ENOMEM);
611 req = it->d.lustre.it_data;
613 /* md_intent_lock() didn't get a request ref if there was an
614 * open error, so don't do cleanup on the request here
616 /* XXX (green): Should not we bail out on any error here, not
617 * just open error? */
618 rc = it_open_error(DISP_OPEN_OPEN, it);
620 ll_file_data_put(fd);
621 GOTO(out_och_free, rc);
624 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
625 rc = ll_local_open(file, it, fd, *och_p);
627 up(&lli->lli_och_sem);
628 ll_file_data_put(fd);
629 GOTO(out_och_free, rc);
632 up(&lli->lli_och_sem);
634 /* Must do this outside lli_och_sem lock to prevent deadlock where
635 different kind of OPEN lock for this same inode gets cancelled
636 by ldlm_cancel_lru */
637 if (!S_ISREG(inode->i_mode))
/* O_LOV_DELAY_CREATE or read-only open: defer OST object creation
 * until ll_lov_setstripe() or the first write. */
644 if (file->f_flags & O_LOV_DELAY_CREATE ||
645 !(file->f_mode & FMODE_WRITE)) {
646 CDEBUG(D_INODE, "object creation was delayed\n");
650 file->f_flags &= ~O_LOV_DELAY_CREATE;
653 ptlrpc_req_finished(req);
655 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
659 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
660 *och_p = NULL; /* OBD_FREE writes some magic there */
663 up(&lli->lli_och_sem);
/* Undo statahead ownership taken above on the error paths. */
665 if (opendir_set == 1) {
666 lli->lli_opendir_key = NULL;
667 lli->lli_opendir_pid = 0;
668 } else if (unlikely(opendir_set == 2)) {
669 ll_stop_statahead(inode, fd);
676 /* Fills the obdo with the attributes for the inode defined by lsm.
 * Sends an async getattr to the data export for each stripe via a request
 * set, waits for completion, then refreshes the VFS inode from the merged
 * reply. Requires a striped (lsm != NULL) regular file. */
677 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
679 struct ptlrpc_request_set *set;
680 struct ll_inode_info *lli = ll_i2info(inode);
681 struct lov_stripe_md *lsm = lli->lli_smd;
683 struct obd_info oinfo = { { { 0 } } };
687 LASSERT(lsm != NULL);
691 oinfo.oi_oa->o_id = lsm->lsm_object_id;
692 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
693 oinfo.oi_oa->o_mode = S_IFREG;
694 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
695 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
696 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
697 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
699 oinfo.oi_capa = ll_mdscapa_get(inode);
701 set = ptlrpc_prep_set();
703 CERROR("can't allocate ptlrpc set\n");
706 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
708 rc = ptlrpc_set_wait(set);
709 ptlrpc_set_destroy(set);
711 capa_put(oinfo.oi_capa);
/* Keep only the attributes the OSTs are authoritative for before
 * pushing them into the VFS inode. */
715 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
716 OBD_MD_FLATIME | OBD_MD_FLMTIME |
717 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
719 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
720 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
721 lli->lli_smd->lsm_object_id, i_size_read(inode),
722 (unsigned long long)inode->i_blocks,
723 (unsigned long)ll_inode_blksize(inode));
/* Strip setuid (and conditionally setgid) bits from the inode mode, the
 * usual security measure on write by an unprivileged process. */
727 static inline void ll_remove_suid(struct inode *inode)
731 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
732 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
734 /* was any of the uid bits set? */
735 mode &= inode->i_mode;
736 if (mode && !capable(CAP_FSETID)) {
737 inode->i_mode &= ~mode;
738 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the stripe index it covers within the
 * inode's stripe MD. Asks the LOV via obd_get_info(KEY_LOCK_TO_STRIPE),
 * then sanity-checks the resource name against that stripe's object.
 * Returns -ELDLM_NO_LOCK_DATA when the resource does not match. */
742 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
744 struct ll_inode_info *lli = ll_i2info(inode);
745 struct lov_stripe_md *lsm = lli->lli_smd;
746 struct obd_export *exp = ll_i2dtexp(inode);
749 struct ldlm_lock *lock;
750 struct lov_stripe_md *lsm;
751 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm };
752 __u32 stripe, vallen = sizeof(stripe);
753 struct lov_oinfo *loinfo;
/* Single-striped file: the answer is trivially stripe 0. */
757 if (lsm->lsm_stripe_count == 1)
758 GOTO(check, stripe = 0);
760 /* get our offset in the lov */
761 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
763 CERROR("obd_get_info: rc = %d\n", rc);
766 LASSERT(stripe < lsm->lsm_stripe_count);
769 loinfo = lsm->lsm_oinfo[stripe];
770 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
771 &lock->l_resource->lr_name)){
772 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
773 loinfo->loi_id, loinfo->loi_gr);
774 RETURN(-ELDLM_NO_LOCK_DATA);
780 /* Get extra page reference to ensure it is not going away.
 * DLM extent callback; @data is the struct page to pin. The matching
 * release happens in ll_page_removal_cb(). */
781 void ll_pin_extent_cb(void *data)
783 struct page *page = data;
785 page_cache_get(page);
790 /* Flush the page from the page cache for an extent as it is canceled.
791 * Page to remove is delivered as @data.
793 * No one can dirty the extent until we've finished our work and they cannot
794 * enqueue another lock. The DLM protects us from ll_file_read/write here,
795 * but other kernel actors could have pages locked.
797 * If @discard is set, there is no need to write the page if it is dirty.
799 * Called with the DLM lock held. */
/* DLM extent-cancel callback: evict @data (a pinned struct page) from the
 * page cache. Tears down mmaps over the page, writes it back first unless
 * @discard is set, records write errors on the mapping, truncates the page,
 * and finally drops the reference taken in ll_pin_extent_cb(). */
800 int ll_page_removal_cb(void *data, int discard)
803 struct page *page = data;
804 struct address_space *mapping;
808 /* We have page reference already from ll_pin_page */
811 /* Already truncated by somebody */
814 mapping = page->mapping;
816 ll_teardown_mmaps(mapping,
817 (__u64)page->index << PAGE_CACHE_SHIFT,
818 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
820 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
/* Dirty page and not discarding: push it to the OST before eviction. */
822 if (!discard && clear_page_dirty_for_io(page)) {
823 LASSERT(page->mapping);
824 rc = ll_call_writepage(page->mapping->host, page);
825 /* either waiting for io to complete or reacquiring
826 * the lock that the failed writepage released */
828 wait_on_page_writeback(page);
830 CERROR("writepage inode %lu(%p) of page %p "
831 "failed: %d\n", mapping->host->i_ino,
832 mapping->host, page, rc);
/* Propagate the write error to the mapping so a later fsync sees it. */
834 set_bit(AS_ENOSPC, &mapping->flags);
836 set_bit(AS_EIO, &mapping->flags);
838 set_bit(AS_EIO, &mapping->flags);
840 if (page->mapping != NULL) {
841 struct ll_async_page *llap = llap_cast_private(page);
842 /* checking again to account for writeback's lock_page() */
843 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
845 ll_ra_accounting(llap, page->mapping);
846 ll_truncate_complete_page(page);
850 LASSERT(!PageWriteback(page));
/* Drop the pin taken by ll_pin_extent_cb(). */
852 page_cache_release(page);
/* Blocking/cancel AST for a file extent lock: on cancel, shrink the
 * known-minimum-size (kms) of the affected stripe to what remaining locks
 * still cover, then queue DONE_WRITING processing for the inode. */
857 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
858 void *data, int flag)
861 struct ll_inode_info *lli;
862 struct lov_stripe_md *lsm;
/* Guard against obviously-bogus (small non-NULL) cbdata pointers. */
868 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
869 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
873 inode = ll_inode_from_lock(lock);
876 lli = ll_i2info(inode);
879 if (lli->lli_smd == NULL)
883 stripe = ll_lock_to_stripe_offset(inode, lock);
/* Recompute kms under both the stripe lock and the lock's resource. */
887 lov_stripe_lock(lsm);
888 lock_res_and_lock(lock);
889 kms = ldlm_extent_shift_kms(lock,
890 lsm->lsm_oinfo[stripe]->loi_kms);
892 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
893 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
894 lsm->lsm_oinfo[stripe]->loi_kms, kms);
895 lsm->lsm_oinfo[stripe]->loi_kms = kms;
896 unlock_res_and_lock(lock);
897 lov_stripe_unlock(lsm);
898 ll_queue_done_writing(inode, 0);
/* Completion AST for client-side async extent enqueues. Blocked states are
 * not expected (LBUG). On grant, update the stripe's rss/kms from the lock
 * value block, wake waiters, and drop the enqueue's LCK_PR reference. */
907 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
909 /* XXX ALLOCATE - 160 bytes */
910 struct inode *inode = ll_inode_from_lock(lock);
911 struct ll_inode_info *lli = ll_i2info(inode);
912 struct lustre_handle lockh = { 0 };
917 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
918 LDLM_FL_BLOCK_CONV)) {
919 LBUG(); /* not expecting any blocked async locks yet */
920 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
922 ldlm_lock_dump(D_OTHER, lock, 0);
923 ldlm_reprocess_all(lock->l_resource);
927 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
929 stripe = ll_lock_to_stripe_offset(inode, lock);
/* Server sent a lock value block: refresh this stripe's size state.
 * NOTE(review): lsm_oinfo is accessed with '.' here but '->' elsewhere
 * in this file — confirm which lsm_oinfo layout this tree uses. */
933 if (lock->l_lvb_len) {
934 struct lov_stripe_md *lsm = lli->lli_smd;
936 lvb = lock->l_lvb_data;
937 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
939 lock_res_and_lock(lock);
940 ll_inode_size_lock(inode, 1);
941 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
942 kms = ldlm_extent_shift_kms(NULL, kms);
943 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
944 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
945 lsm->lsm_oinfo[stripe].loi_kms, kms);
946 lsm->lsm_oinfo[stripe].loi_kms = kms;
947 ll_inode_size_unlock(inode, 1);
948 unlock_res_and_lock(lock);
953 wake_up(&lock->l_waitq);
/* Drop the reference the async enqueue took on this lock. */
955 ldlm_lock2handle(lock, &lockh);
956 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client is asking for this file's size. Reply with a
 * lock value block holding our kms for the affected stripe plus the inode
 * times. -ELDLM_NO_LOCK_DATA replies are normal races and are not logged
 * as errors. @reqp is the incoming ptlrpc request to pack the reply into. */
961 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
963 struct ptlrpc_request *req = reqp;
964 struct inode *inode = ll_inode_from_lock(lock);
965 struct ll_inode_info *lli;
966 struct lov_stripe_md *lsm;
972 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
973 lli = ll_i2info(inode);
975 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
978 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
980 /* First, find out which stripe index this lock corresponds to. */
981 stripe = ll_lock_to_stripe_offset(inode, lock);
983 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
985 req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
986 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
988 rc = req_capsule_server_pack(&req->rq_pill);
990 CERROR("lustre_pack_reply: %d\n", rc);
994 lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
995 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
996 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
997 lvb->lvb_atime = LTIME_S(inode->i_atime);
998 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1000 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1001 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1002 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1003 lvb->lvb_atime, lvb->lvb_ctime);
1008 /* These errors are normal races, so we don't want to fill the console
1009 * with messages by calling ptlrpc_error() */
1010 if (rc == -ELDLM_NO_LOCK_DATA)
1011 lustre_pack_reply(req, 1, NULL, NULL);
1013 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into one LVB and refresh the VFS
 * inode's size, blocks and times from it, under the inode size lock. */
1017 static int ll_merge_lvb(struct inode *inode)
1019 struct ll_inode_info *lli = ll_i2info(inode);
1020 struct ll_sb_info *sbi = ll_i2sbi(inode);
1026 ll_inode_size_lock(inode, 1);
1027 inode_init_lvb(inode, &lvb);
1028 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1029 i_size_write(inode, lvb.lvb_size);
1030 inode->i_blocks = lvb.lvb_blocks;
1032 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1033 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1034 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1035 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size request purely from locally cached extent locks:
 * match an existing [0, EOF] PR lock, merge the LVBs into the inode, and
 * release the matched lock. Avoids a glimpse RPC when it succeeds. */
1040 int ll_local_size(struct inode *inode)
1042 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1043 struct ll_inode_info *lli = ll_i2info(inode);
1044 struct ll_sb_info *sbi = ll_i2sbi(inode);
1045 struct lustre_handle lockh = { 0 };
1050 if (lli->lli_smd->lsm_stripe_count == 0)
1053 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1054 &policy, LCK_PR, &flags, inode, &lockh)​;
1060 rc = ll_merge_lvb(inode);
1061 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse a file described only by @lsm (no inode, e.g. from an ioctl):
 * enqueue an intent-only PR extent lock to collect sizes from other
 * clients, then fill @st from the merged lock value blocks. */
1065 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1068 struct lustre_handle lockh = { 0 };
1069 struct ldlm_enqueue_info einfo = { 0 };
1070 struct obd_info oinfo = { { { 0 } } };
1076 einfo.ei_type = LDLM_EXTENT;
1077 einfo.ei_mode = LCK_PR;
1078 einfo.ei_cb_bl = osc_extent_blocking_cb;
1079 einfo.ei_cb_cp = ldlm_completion_ast;
1080 einfo.ei_cb_gl = ll_glimpse_callback;
1081 einfo.ei_cbdata = NULL;
1083 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1084 oinfo.oi_lockh = &lockh;
/* LDLM_FL_HAS_INTENT: glimpse only, do not revoke conflicting locks. */
1086 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1088 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1092 CERROR("obd_enqueue returned rc %d, "
1093 "returning -EIO\n", rc);
1094 RETURN(rc > 0 ? -EIO : rc);
1097 lov_stripe_lock(lsm);
1098 memset(&lvb, 0, sizeof(lvb));
1099 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1100 st->st_size = lvb.lvb_size;
1101 st->st_blocks = lvb.lvb_blocks;
1102 st->st_mtime = lvb.lvb_mtime;
1103 st->st_atime = lvb.lvb_atime;
1104 st->st_ctime = lvb.lvb_ctime;
1105 lov_stripe_unlock(lsm);
1110 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1111 * file (because it prefers KMS over RSS when larger) */
/* Obtain an up-to-date size for @inode by glimpsing all clients holding
 * extent locks, then merge the returned LVBs into the inode. Skipped when
 * the MDS already holds an authoritative size (LLIF_MDS_SIZE_LOCK). */
1112 int ll_glimpse_size(struct inode *inode, int ast_flags)
1114 struct ll_inode_info *lli = ll_i2info(inode);
1115 struct ll_sb_info *sbi = ll_i2sbi(inode);
1116 struct lustre_handle lockh = { 0 };
1117 struct ldlm_enqueue_info einfo = { 0 };
1118 struct obd_info oinfo = { { { 0 } } };
1122 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1125 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1127 if (!lli->lli_smd) {
1128 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1132 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1133 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1134 * won't revoke any conflicting DLM locks held. Instead,
1135 * ll_glimpse_callback() will be called on each client
1136 * holding a DLM lock against this file, and resulting size
1137 * will be returned for each stripe. DLM lock on [0, EOF] is
1138 * acquired only if there were no conflicting locks. */
1139 einfo.ei_type = LDLM_EXTENT;
1140 einfo.ei_mode = LCK_PR;
1141 einfo.ei_cb_bl = osc_extent_blocking_cb;
1142 einfo.ei_cb_cp = ldlm_completion_ast;
1143 einfo.ei_cb_gl = ll_glimpse_callback;
1144 einfo.ei_cbdata = inode;
1146 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1147 oinfo.oi_lockh = &lockh;
1148 oinfo.oi_md = lli->lli_smd;
1149 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1151 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1155 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1156 RETURN(rc > 0 ? -EIO : rc);
1159 rc = ll_merge_lvb(inode);
1161 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1162 i_size_read(inode), (unsigned long long)inode->i_blocks);
/* Acquire a DLM extent lock of @mode over @policy's range for @inode and
 * refresh the inode's size/times from the merged lock value blocks.
 * Skips locking entirely for fds/superblocks flagged to ignore locks.
 * On return, *policy holds the actually granted extent. */
1167 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1168 struct lov_stripe_md *lsm, int mode,
1169 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1172 struct ll_sb_info *sbi = ll_i2sbi(inode);
1174 struct ldlm_enqueue_info einfo = { 0 };
1175 struct obd_info oinfo = { { { 0 } } };
1179 LASSERT(!lustre_handle_is_used(lockh));
1180 LASSERT(lsm != NULL);
1182 /* don't drop the mmapped file to LRU */
1183 if (mapping_mapped(inode->i_mapping))
1184 ast_flags |= LDLM_FL_NO_LRU;
1186 /* XXX phil: can we do this? won't it screw the file size up? */
1187 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1188 (sbi->ll_flags & LL_SBI_NOLCK))
1191 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1192 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1194 einfo.ei_type = LDLM_EXTENT;
1195 einfo.ei_mode = mode;
1196 einfo.ei_cb_bl = osc_extent_blocking_cb;
1197 einfo.ei_cb_cp = ldlm_completion_ast;
1198 einfo.ei_cb_gl = ll_glimpse_callback;
1199 einfo.ei_cbdata = inode;
1201 oinfo.oi_policy = *policy;
1202 oinfo.oi_lockh = lockh;
1204 oinfo.oi_flags = ast_flags;
1206 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
1207 *policy = oinfo.oi_policy;
1211 ll_inode_size_lock(inode, 1);
1212 inode_init_lvb(inode, &lvb);
1213 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
/* Only a full-file [0, EOF] lock entitles us to reset i_size; see the
 * ordering argument below against racing truncate. */
1215 if (policy->l_extent.start == 0 &&
1216 policy->l_extent.end == OBD_OBJECT_EOF) {
1217 /* vmtruncate()->ll_truncate() first sets the i_size and then
1218 * the kms under both a DLM lock and the
1219 * ll_inode_size_lock(). If we don't get the
1220 * ll_inode_size_lock() here we can match the DLM lock and
1221 * reset i_size from the kms before the truncating path has
1222 * updated the kms. generic_file_write can then trust the
1223 * stale i_size when doing appending writes and effectively
1224 * cancel the result of the truncate. Getting the
1225 * ll_inode_size_lock() after the enqueue maintains the DLM
1226 * -> ll_inode_size_lock() acquiring order. */
1227 i_size_write(inode, lvb.lvb_size);
1228 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1229 inode->i_ino, i_size_read(inode));
1233 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1234 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1235 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1237 ll_inode_size_unlock(inode, 1);
1242 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1243 struct lov_stripe_md *lsm, int mode,
1244 struct lustre_handle *lockh)
1246 struct ll_sb_info *sbi = ll_i2sbi(inode);
1250 /* XXX phil: can we do this? won't it screw the file size up? */
1251 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1252 (sbi->ll_flags & LL_SBI_NOLCK))
1255 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
1260 static void ll_set_file_contended(struct inode *inode)
1262 struct ll_inode_info *lli = ll_i2info(inode);
1263 cfs_time_t now = cfs_time_current();
1265 spin_lock(&lli->lli_lock);
1266 lli->lli_contention_time = now;
1267 lli->lli_flags |= LLIF_CONTENDED;
1268 spin_unlock(&lli->lli_lock);
1271 void ll_clear_file_contended(struct inode *inode)
1273 struct ll_inode_info *lli = ll_i2info(inode);
1275 spin_lock(&lli->lli_lock);
1276 lli->lli_flags &= ~LLIF_CONTENDED;
1277 spin_unlock(&lli->lli_lock);
1280 static int ll_is_file_contended(struct file *file)
1282 struct inode *inode = file->f_dentry->d_inode;
1283 struct ll_inode_info *lli = ll_i2info(inode);
1284 struct ll_sb_info *sbi = ll_i2sbi(inode);
1285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1288 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1289 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1290 " osc connect flags = 0x"LPX64"\n",
1291 sbi->ll_lco.lco_flags);
1294 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1296 if (lli->lli_flags & LLIF_CONTENDED) {
1297 cfs_time_t cur_time = cfs_time_current();
1298 cfs_time_t retry_time;
1300 retry_time = cfs_time_add(
1301 lli->lli_contention_time,
1302 cfs_time_seconds(sbi->ll_contention_time));
1303 if (cfs_time_after(cur_time, retry_time)) {
1304 ll_clear_file_contended(inode);
1312 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1313 const char *buf, size_t count,
1314 loff_t start, loff_t end, int rw)
1317 int tree_locked = 0;
1319 struct inode * inode = file->f_dentry->d_inode;
1322 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1324 if (append || !ll_is_file_contended(file)) {
1325 struct ll_lock_tree_node *node;
1328 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1329 if (file->f_flags & O_NONBLOCK)
1330 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1331 node = ll_node_from_inode(inode, start, end,
1332 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1337 tree->lt_fd = LUSTRE_FPRIVATE(file);
1338 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1341 else if (rc == -EUSERS)
1342 ll_set_file_contended(inode);
1346 RETURN(tree_locked);
1352 * Checks if requested extent lock is compatible with a lock under a page.
1354 * Checks if the lock under \a page is compatible with a read or write lock
1355 * (specified by \a rw) for an extent [\a start , \a end].
1357 * \param page the page under which lock is considered
1358 * \param rw OBD_BRW_READ if requested for reading,
1359 * OBD_BRW_WRITE if requested for writing
1360 * \param start start of the requested extent
1361 * \param end end of the requested extent
1362 * \param cookie transparent parameter for passing locking context
1364 * \post result == 1, *cookie == context, appropriate lock is referenced or
1367 * \retval 1 owned lock is reused for the request
1368 * \retval 0 no lock reused for the request
1370 * \see ll_release_short_lock
1372 static int ll_reget_short_lock(struct page *page, int rw,
1373 obd_off start, obd_off end,
1376 struct ll_async_page *llap;
1377 struct obd_export *exp;
1378 struct inode *inode = page->mapping->host;
1382 exp = ll_i2dtexp(inode);
1386 llap = llap_cast_private(page);
1390 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1391 &llap->llap_cookie, rw, start, end,
1396 * Releases a reference to a lock taken in a "fast" way.
1398 * Releases a read or a write (specified by \a rw) lock
1399 * referenced by \a cookie.
1401 * \param inode inode to which data belong
1402 * \param end end of the locked extent
1403 * \param rw OBD_BRW_READ if requested for reading,
1404 * OBD_BRW_WRITE if requested for writing
1405 * \param cookie transparent parameter for passing locking context
1407 * \post appropriate lock is dereferenced
1409 * \see ll_reget_short_lock
1411 static void ll_release_short_lock(struct inode *inode, obd_off end,
1412 void *cookie, int rw)
1414 struct obd_export *exp;
1417 exp = ll_i2dtexp(inode);
1421 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
1424 CERROR("unlock failed (%d)\n", rc);
1428 * Checks if requested extent lock is compatible
1429 * with a lock under a page in page cache.
1431 * Checks if a lock under some \a page is compatible with a read or write lock
1432 * (specified by \a rw) for an extent [\a start , \a end].
1434 * \param file the file under which lock is considered
1435 * \param rw OBD_BRW_READ if requested for reading,
1436 * OBD_BRW_WRITE if requested for writing
1437 * \param ppos start of the requested extent
1438 * \param end end of the requested extent
1439 * \param cookie transparent parameter for passing locking context
1440 * \param buf userspace buffer for the data
1442 * \post result == 1, *cookie == context, appropriate lock is referenced
1445 * \retval 1 owned lock is reused for the request
1446 * \retval 0 no lock reused for the request
1448 * \see ll_file_put_fast_lock
1450 static inline int ll_file_get_fast_lock(struct file *file,
1451 obd_off ppos, obd_off end,
1452 char *buf, void **cookie, int rw)
1459 if (!ll_region_mapped((unsigned long)buf, end - ppos)) {
1460 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1461 ppos >> CFS_PAGE_SHIFT);
1463 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
1467 page_cache_release(page);
1475 * Releases a reference to a lock taken in a "fast" way.
1477 * Releases a read or a write (specified by \a rw) lock
1478 * referenced by \a cookie.
1480 * \param inode inode to which data belong
1481 * \param end end of the locked extent
1482 * \param rw OBD_BRW_READ if requested for reading,
1483 * OBD_BRW_WRITE if requested for writing
1484 * \param cookie transparent parameter for passing locking context
1486 * \post appropriate lock is dereferenced
1488 * \see ll_file_get_fast_lock
1490 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1491 void *cookie, int rw)
1493 ll_release_short_lock(inode, end, cookie, rw);
1496 enum ll_lock_style {
1497 LL_LOCK_STYLE_NOLOCK = 0,
1498 LL_LOCK_STYLE_FASTLOCK = 1,
1499 LL_LOCK_STYLE_TREELOCK = 2
1503 * Checks if requested extent lock is compatible with a lock
1504 * under a page cache page.
1506 * Checks if the lock under \a page is compatible with a read or write lock
1507 * (specified by \a rw) for an extent [\a start , \a end].
1509 * \param file file under which I/O is processed
1510 * \param rw OBD_BRW_READ if requested for reading,
1511 * OBD_BRW_WRITE if requested for writing
1512 * \param ppos start of the requested extent
1513 * \param end end of the requested extent
1514 * \param cookie transparent parameter for passing locking context
1515 * (only used with LL_LOCK_STYLE_FASTLOCK)
1516 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1517 * \param buf userspace buffer for the data
1519 * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
1520 * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
1521 * \retval LL_LOCK_STYLE_NOLOCK got no lock
1523 * \see ll_file_put_lock
1525 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1526 obd_off end, char *buf, void **cookie,
1527 struct ll_lock_tree *tree, int rw)
1533 if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
1534 RETURN(LL_LOCK_STYLE_FASTLOCK);
1536 rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw);
1537 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1540 RETURN(LL_LOCK_STYLE_TREELOCK);
1542 RETURN(LL_LOCK_STYLE_NOLOCK);
1545 /* an error happened if we reached this point, rc = -errno here */
1550 * Drops the lock taken by ll_file_get_lock.
1552 * Releases a read or a write (specified by \a rw) lock
1553 * referenced by \a tree or \a cookie.
1555 * \param inode inode to which data belong
1556 * \param end end of the locked extent
1557 * \param lockstyle facility through which the lock was taken
1558 * \param rw OBD_BRW_READ if requested for reading,
1559 * OBD_BRW_WRITE if requested for writing
1560 * \param cookie transparent parameter for passing locking context
1561 * (only used with LL_LOCK_STYLE_FASTLOCK)
1562 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1564 * \post appropriate lock is dereferenced
1566 * \see ll_file_get_lock
1568 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1569 enum ll_lock_style lock_style,
1570 void *cookie, struct ll_lock_tree *tree,
1574 switch (lock_style) {
1575 case LL_LOCK_STYLE_TREELOCK:
1576 ll_tree_unlock(tree);
1578 case LL_LOCK_STYLE_FASTLOCK:
1579 ll_file_put_fast_lock(inode, end, cookie, rw);
1582 CERROR("invalid locking style (%d)\n", lock_style);
1586 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1589 struct inode *inode = file->f_dentry->d_inode;
1590 struct ll_inode_info *lli = ll_i2info(inode);
1591 struct lov_stripe_md *lsm = lli->lli_smd;
1592 struct ll_sb_info *sbi = ll_i2sbi(inode);
1593 struct ll_lock_tree tree;
1595 struct ll_ra_read bead;
1598 ssize_t retval, chunk, sum = 0;
1604 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1605 inode->i_ino, inode->i_generation, inode, count, *ppos);
1606 /* "If nbyte is 0, read() will return 0 and have no other results."
1607 * -- Single Unix Spec */
1611 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1614 /* Read on file with no objects should return zero-filled
1615 * buffers up to file size (we can get non-zero sizes with
1616 * mknod + truncate, then opening file for read. This is a
1617 * common pattern in NFS case, it seems). Bug 6243 */
1619 /* Since there are no objects on OSTs, we have nothing to get
1620 * lock on and so we are forced to access inode->i_size
1623 /* Read beyond end of file */
1624 if (*ppos >= i_size_read(inode))
1627 if (count > i_size_read(inode) - *ppos)
1628 count = i_size_read(inode) - *ppos;
1629 /* Make sure to correctly adjust the file pos pointer for
1631 notzeroed = clear_user(buf, count);
1639 if (sbi->ll_max_rw_chunk != 0) {
1640 /* first, let's know the end of the current stripe */
1642 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1644 /* correct, the end is beyond the request */
1645 if (end > *ppos + count - 1)
1646 end = *ppos + count - 1;
1648 /* and chunk shouldn't be too large even if striping is wide */
1649 if (end - *ppos > sbi->ll_max_rw_chunk)
1650 end = *ppos + sbi->ll_max_rw_chunk - 1;
1652 end = *ppos + count - 1;
1655 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1656 buf, &cookie, &tree, OBD_BRW_READ);
1658 GOTO(out, retval = lock_style);
1660 ll_inode_size_lock(inode, 1);
1662 * Consistency guarantees: following possibilities exist for the
1663 * relation between region being read and real file size at this
1666 * (A): the region is completely inside of the file;
1668 * (B-x): x bytes of region are inside of the file, the rest is
1671 * (C): the region is completely outside of the file.
1673 * This classification is stable under DLM lock acquired by
1674 * ll_tree_lock() above, because to change class, other client has to
1675 * take DLM lock conflicting with our lock. Also, any updates to
1676 * ->i_size by other threads on this client are serialized by
1677 * ll_inode_size_lock(). This guarantees that short reads are handled
1678 * correctly in the face of concurrent writes and truncates.
1680 inode_init_lvb(inode, &lvb);
1681 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1683 if (*ppos + count - 1 > kms) {
1684 /* A glimpse is necessary to determine whether we return a
1685 * short read (B) or some zeroes at the end of the buffer (C) */
1686 ll_inode_size_unlock(inode, 1);
1687 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1689 if (lock_style != LL_LOCK_STYLE_NOLOCK)
1690 ll_file_put_lock(inode, end, lock_style,
1691 cookie, &tree, OBD_BRW_READ);
1695 /* region is within kms and, hence, within real file size (A).
1696 * We need to increase i_size to cover the read region so that
1697 * generic_file_read() will do its job, but that doesn't mean
1698 * the kms size is _correct_, it is only the _minimum_ size.
1699 * If someone does a stat they will get the correct size which
1700 * will always be >= the kms value here. b=11081 */
1701 if (i_size_read(inode) < kms)
1702 i_size_write(inode, kms);
1703 ll_inode_size_unlock(inode, 1);
1706 chunk = end - *ppos + 1;
1707 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1708 inode->i_ino, chunk, *ppos, i_size_read(inode));
1710 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1711 /* turn off the kernel's read-ahead */
1712 file->f_ra.ra_pages = 0;
1714 /* initialize read-ahead window once per syscall */
1717 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1718 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1719 ll_ra_read_in(file, &bead);
1723 file_accessed(file);
1724 retval = generic_file_read(file, buf, chunk, ppos);
1725 ll_file_put_lock(inode, end, lock_style, cookie, &tree,
1728 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1731 ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
1737 if (retval == chunk && count > 0)
1743 ll_ra_read_ex(file, &bead);
1744 retval = (sum > 0) ? sum : retval;
1749 * Write to a file (through the page cache).
1751 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1754 struct inode *inode = file->f_dentry->d_inode;
1755 struct ll_sb_info *sbi = ll_i2sbi(inode);
1756 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1757 struct ll_lock_tree tree;
1758 loff_t maxbytes = ll_file_maxbytes(inode);
1759 loff_t lock_start, lock_end, end;
1760 ssize_t retval, chunk, sum = 0;
1764 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1765 inode->i_ino, inode->i_generation, inode, count, *ppos);
1767 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1769 /* POSIX, but surprised the VFS doesn't check this already */
1773 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1774 * called on the file, don't fail the below assertion (bug 2388). */
1775 if (file->f_flags & O_LOV_DELAY_CREATE &&
1776 ll_i2info(inode)->lli_smd == NULL)
1779 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1781 down(&ll_i2info(inode)->lli_write_sem);
1784 chunk = 0; /* just to fix gcc's warning */
1785 end = *ppos + count - 1;
1787 if (file->f_flags & O_APPEND) {
1789 lock_end = OBD_OBJECT_EOF;
1790 } else if (sbi->ll_max_rw_chunk != 0) {
1791 /* first, let's know the end of the current stripe */
1793 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1796 /* correct, the end is beyond the request */
1797 if (end > *ppos + count - 1)
1798 end = *ppos + count - 1;
1800 /* and chunk shouldn't be too large even if striping is wide */
1801 if (end - *ppos > sbi->ll_max_rw_chunk)
1802 end = *ppos + sbi->ll_max_rw_chunk - 1;
1807 lock_end = *ppos + count - 1;
1810 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1811 lock_start, lock_end, OBD_BRW_WRITE);
1812 if (tree_locked < 0)
1813 GOTO(out, retval = tree_locked);
1815 /* This is ok, g_f_w will overwrite this under i_sem if it races
1816 * with a local truncate, it just makes our maxbyte checking easier.
1817 * The i_size value gets updated in ll_extent_lock() as a consequence
1818 * of the [0,EOF] extent lock we requested above. */
1819 if (file->f_flags & O_APPEND) {
1820 *ppos = i_size_read(inode);
1821 end = *ppos + count - 1;
1824 if (*ppos >= maxbytes) {
1825 send_sig(SIGXFSZ, current, 0);
1826 GOTO(out_unlock, retval = -EFBIG);
1828 if (end > maxbytes - 1)
1831 /* generic_file_write handles O_APPEND after getting i_mutex */
1832 chunk = end - *ppos + 1;
1833 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1834 inode->i_ino, chunk, *ppos);
1836 retval = generic_file_write(file, buf, chunk, ppos);
1838 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1840 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1844 ll_tree_unlock(&tree);
1851 if (retval == chunk && count > 0)
1855 up(&ll_i2info(inode)->lli_write_sem);
1857 retval = (sum > 0) ? sum : retval;
1858 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1859 retval > 0 ? retval : 0);
1864 * Send file content (through pagecache) somewhere with helper
1866 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1867 read_actor_t actor, void *target)
1869 struct inode *inode = in_file->f_dentry->d_inode;
1870 struct ll_inode_info *lli = ll_i2info(inode);
1871 struct lov_stripe_md *lsm = lli->lli_smd;
1872 struct ll_lock_tree tree;
1873 struct ll_lock_tree_node *node;
1875 struct ll_ra_read bead;
1880 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1881 inode->i_ino, inode->i_generation, inode, count, *ppos);
1883 /* "If nbyte is 0, read() will return 0 and have no other results."
1884 * -- Single Unix Spec */
1888 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1889 /* turn off the kernel's read-ahead */
1890 in_file->f_ra.ra_pages = 0;
1892 /* File with no objects, nothing to lock */
1894 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1896 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1898 RETURN(PTR_ERR(node));
1900 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1901 rc = ll_tree_lock(&tree, node, NULL, count,
1902 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1906 ll_clear_file_contended(inode);
1907 ll_inode_size_lock(inode, 1);
1909 * Consistency guarantees: following possibilities exist for the
1910 * relation between region being read and real file size at this
1913 * (A): the region is completely inside of the file;
1915 * (B-x): x bytes of region are inside of the file, the rest is
1918 * (C): the region is completely outside of the file.
1920 * This classification is stable under DLM lock acquired by
1921 * ll_tree_lock() above, because to change class, other client has to
1922 * take DLM lock conflicting with our lock. Also, any updates to
1923 * ->i_size by other threads on this client are serialized by
1924 * ll_inode_size_lock(). This guarantees that short reads are handled
1925 * correctly in the face of concurrent writes and truncates.
1927 inode_init_lvb(inode, &lvb);
1928 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1930 if (*ppos + count - 1 > kms) {
1931 /* A glimpse is necessary to determine whether we return a
1932 * short read (B) or some zeroes at the end of the buffer (C) */
1933 ll_inode_size_unlock(inode, 1);
1934 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1938 /* region is within kms and, hence, within real file size (A) */
1939 i_size_write(inode, kms);
1940 ll_inode_size_unlock(inode, 1);
1943 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1944 inode->i_ino, count, *ppos, i_size_read(inode));
1946 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1947 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1948 ll_ra_read_in(in_file, &bead);
1950 file_accessed(in_file);
1951 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1952 ll_ra_read_ex(in_file, &bead);
1955 ll_tree_unlock(&tree);
1959 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1962 struct ll_inode_info *lli = ll_i2info(inode);
1963 struct obd_export *exp = ll_i2dtexp(inode);
1964 struct ll_recreate_obj ucreatp;
1965 struct obd_trans_info oti = { 0 };
1966 struct obdo *oa = NULL;
1969 struct lov_stripe_md *lsm, *lsm2;
1972 if (!capable (CAP_SYS_ADMIN))
1975 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1976 sizeof(struct ll_recreate_obj));
1984 down(&lli->lli_size_sem);
1987 GOTO(out, rc = -ENOENT);
1988 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1989 (lsm->lsm_stripe_count));
1991 OBD_ALLOC(lsm2, lsm_size);
1993 GOTO(out, rc = -ENOMEM);
1995 oa->o_id = ucreatp.lrc_id;
1996 oa->o_gr = ucreatp.lrc_group;
1997 oa->o_nlink = ucreatp.lrc_ost_idx;
1998 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1999 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
2000 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2001 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2003 memcpy(lsm2, lsm, lsm_size);
2004 rc = obd_create(exp, oa, &lsm2, &oti);
2006 OBD_FREE(lsm2, lsm_size);
2009 up(&lli->lli_size_sem);
2014 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2015 int flags, struct lov_user_md *lum, int lum_size)
2017 struct ll_inode_info *lli = ll_i2info(inode);
2018 struct lov_stripe_md *lsm;
2019 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2023 down(&lli->lli_size_sem);
2026 up(&lli->lli_size_sem);
2027 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2032 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2035 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2036 GOTO(out_req_free, rc = -ENOENT);
2037 rc = oit.d.lustre.it_status;
2039 GOTO(out_req_free, rc);
2041 ll_release_openhandle(file->f_dentry, &oit);
2044 up(&lli->lli_size_sem);
2045 ll_intent_release(&oit);
2048 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2052 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2053 struct lov_mds_md **lmmp, int *lmm_size,
2054 struct ptlrpc_request **request)
2056 struct ll_sb_info *sbi = ll_i2sbi(inode);
2057 struct mdt_body *body;
2058 struct lov_mds_md *lmm = NULL;
2059 struct ptlrpc_request *req = NULL;
2060 struct obd_capa *oc;
2063 rc = ll_get_max_mdsize(sbi, &lmmsize);
2067 oc = ll_mdscapa_get(inode);
2068 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
2069 oc, filename, strlen(filename) + 1,
2070 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
2071 ll_i2suppgid(inode), &req);
2074 CDEBUG(D_INFO, "md_getattr_name failed "
2075 "on %s: rc %d\n", filename, rc);
2079 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2080 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2082 lmmsize = body->eadatasize;
2084 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2086 GOTO(out, rc = -ENODATA);
2089 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2090 LASSERT(lmm != NULL);
2093 * This is coming from the MDS, so is probably in
2094 * little endian. We convert it to host endian before
2095 * passing it to userspace.
2097 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
2098 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
2099 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
2100 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
2101 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
2104 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2105 struct lov_stripe_md *lsm;
2106 struct lov_user_md_join *lmj;
2107 int lmj_size, i, aindex = 0;
2109 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
2111 GOTO(out, rc = -ENOMEM);
2112 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
2114 GOTO(out_free_memmd, rc);
2116 lmj_size = sizeof(struct lov_user_md_join) +
2117 lsm->lsm_stripe_count *
2118 sizeof(struct lov_user_ost_data_join);
2119 OBD_ALLOC(lmj, lmj_size);
2121 GOTO(out_free_memmd, rc = -ENOMEM);
2123 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2124 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2125 struct lov_extent *lex =
2126 &lsm->lsm_array->lai_ext_array[aindex];
2128 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2130 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2131 LPU64" len %d\n", aindex, i,
2132 lex->le_start, (int)lex->le_len);
2133 lmj->lmm_objects[i].l_extent_start =
2136 if ((int)lex->le_len == -1)
2137 lmj->lmm_objects[i].l_extent_end = -1;
2139 lmj->lmm_objects[i].l_extent_end =
2140 lex->le_start + lex->le_len;
2141 lmj->lmm_objects[i].l_object_id =
2142 lsm->lsm_oinfo[i]->loi_id;
2143 lmj->lmm_objects[i].l_object_gr =
2144 lsm->lsm_oinfo[i]->loi_gr;
2145 lmj->lmm_objects[i].l_ost_gen =
2146 lsm->lsm_oinfo[i]->loi_ost_gen;
2147 lmj->lmm_objects[i].l_ost_idx =
2148 lsm->lsm_oinfo[i]->loi_ost_idx;
2150 lmm = (struct lov_mds_md *)lmj;
2153 obd_free_memmd(sbi->ll_dt_exp, &lsm);
2157 *lmm_size = lmmsize;
2162 static int ll_lov_setea(struct inode *inode, struct file *file,
2165 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2166 struct lov_user_md *lump;
2167 int lum_size = sizeof(struct lov_user_md) +
2168 sizeof(struct lov_user_ost_data);
2172 if (!capable (CAP_SYS_ADMIN))
2175 OBD_ALLOC(lump, lum_size);
2179 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
2181 OBD_FREE(lump, lum_size);
2185 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2187 OBD_FREE(lump, lum_size);
2191 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2194 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
2196 int flags = FMODE_WRITE;
2199 /* Bug 1152: copy properly when this is no longer true */
2200 LASSERT(sizeof(lum) == sizeof(*lump));
2201 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
2202 rc = copy_from_user(&lum, lump, sizeof(lum));
2206 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
2208 put_user(0, &lump->lmm_stripe_count);
2209 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
2210 0, ll_i2info(inode)->lli_smd, lump);
2215 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2217 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2222 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
2226 static int ll_get_grouplock(struct inode *inode, struct file *file,
2229 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2230 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2231 .end = OBD_OBJECT_EOF}};
2232 struct lustre_handle lockh = { 0 };
2233 struct ll_inode_info *lli = ll_i2info(inode);
2234 struct lov_stripe_md *lsm = lli->lli_smd;
2238 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2242 policy.l_extent.gid = arg;
2243 if (file->f_flags & O_NONBLOCK)
2244 flags = LDLM_FL_BLOCK_NOWAIT;
2246 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2250 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2252 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
2257 static int ll_put_grouplock(struct inode *inode, struct file *file,
2260 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2261 struct ll_inode_info *lli = ll_i2info(inode);
2262 struct lov_stripe_md *lsm = lli->lli_smd;
2266 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2267 /* Ugh, it's already unlocked. */
2271 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2274 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2276 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2281 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2286 static int join_sanity_check(struct inode *head, struct inode *tail)
2289 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2290 CERROR("server do not support join \n");
2293 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2294 CERROR("tail ino %lu and ino head %lu must be regular\n",
2295 head->i_ino, tail->i_ino);
2298 if (head->i_ino == tail->i_ino) {
2299 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2302 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2303 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
2309 static int join_file(struct inode *head_inode, struct file *head_filp,
2310 struct file *tail_filp)
2312 struct dentry *tail_dentry = tail_filp->f_dentry;
2313 struct lookup_intent oit = {.it_op = IT_OPEN,
2314 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2315 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2316 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2318 struct lustre_handle lockh;
2319 struct md_op_data *op_data;
2324 tail_dentry = tail_filp->f_dentry;
2326 data = i_size_read(head_inode);
2327 op_data = ll_prep_md_op_data(NULL, head_inode,
2328 tail_dentry->d_parent->d_inode,
2329 tail_dentry->d_name.name,
2330 tail_dentry->d_name.len, 0,
2331 LUSTRE_OPC_ANY, &data);
2332 if (IS_ERR(op_data))
2333 RETURN(PTR_ERR(op_data));
2335 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2336 op_data, &lockh, NULL, 0, 0);
2338 ll_finish_md_op_data(op_data);
2342 rc = oit.d.lustre.it_status;
2344 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2345 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2346 ptlrpc_req_finished((struct ptlrpc_request *)
2347 oit.d.lustre.it_data);
2351 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2353 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2354 oit.d.lustre.it_lock_mode = 0;
2356 ll_release_openhandle(head_filp->f_dentry, &oit);
2358 ll_intent_release(&oit);
2362 static int ll_file_join(struct inode *head, struct file *filp,
2363 char *filename_tail)
2365 struct inode *tail = NULL, *first = NULL, *second = NULL;
2366 struct dentry *tail_dentry;
2367 struct file *tail_filp, *first_filp, *second_filp;
2368 struct ll_lock_tree first_tree, second_tree;
2369 struct ll_lock_tree_node *first_node, *second_node;
2370 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2371 int rc = 0, cleanup_phase = 0;
2374 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2375 head->i_ino, head->i_generation, head, filename_tail);
2377 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2378 if (IS_ERR(tail_filp)) {
2379 CERROR("Can not open tail file %s", filename_tail);
2380 rc = PTR_ERR(tail_filp);
2383 tail = igrab(tail_filp->f_dentry->d_inode);
2385 tlli = ll_i2info(tail);
2386 tail_dentry = tail_filp->f_dentry;
2387 LASSERT(tail_dentry);
2390 /*reorder the inode for lock sequence*/
2391 first = head->i_ino > tail->i_ino ? head : tail;
2392 second = head->i_ino > tail->i_ino ? tail : head;
2393 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2394 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2396 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2397 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2398 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2399 if (IS_ERR(first_node)){
2400 rc = PTR_ERR(first_node);
2403 first_tree.lt_fd = first_filp->private_data;
2404 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2409 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2410 if (IS_ERR(second_node)){
2411 rc = PTR_ERR(second_node);
2414 second_tree.lt_fd = second_filp->private_data;
2415 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2420 rc = join_sanity_check(head, tail);
2424 rc = join_file(head, filp, tail_filp);
2428 switch (cleanup_phase) {
2430 ll_tree_unlock(&second_tree);
2431 obd_cancel_unused(ll_i2dtexp(second),
2432 ll_i2info(second)->lli_smd, 0, NULL);
2434 ll_tree_unlock(&first_tree);
2435 obd_cancel_unused(ll_i2dtexp(first),
2436 ll_i2info(first)->lli_smd, 0, NULL);
2438 filp_close(tail_filp, 0);
2441 if (head && rc == 0) {
2442 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2444 hlli->lli_smd = NULL;
2449 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Release the MDS open handle that an intent lookup may have left behind
 * when the open will not be consumed by a regular ll_file_open().
 * Allocates a temporary obd_client_handle, fills it from the intent and
 * closes it via ll_close_inode_openhandle().
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
        struct inode *inode = dentry->d_inode;
        struct obd_client_handle *och;

        /* Root ? Do nothing. */
        if (dentry->d_inode->i_sb->s_root == dentry)

        /* No open handle to close? Move away */
        if (!it_disposition(it, DISP_OPEN_OPEN))

        /* the open must have succeeded if we got this far */
        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

        OBD_ALLOC(och, sizeof(*och));
                GOTO(out, rc = -ENOMEM);

        /* copy the open handle from the intent reply into och */
        ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
                    ll_i2info(inode), it, och);

        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,

        /* this one is in place of ll_file_open */
        ptlrpc_req_finished(it->d.lustre.it_data);
        it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl entry point for regular Lustre files.  Dispatches the private
 * LL_IOC_* commands (flags, striping, join, group locks, statfs), the
 * ext3-compatible flag/version ioctls, and offers anything unrecognized
 * first to dynamically registered handlers (ll_iocontrol_call) and then
 * to the OBD layer via obd_iocontrol().
 */
int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
               inode->i_generation, inode, cmd);
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */

        case LL_IOC_GETFLAGS:
                /* Get the current value of the file flags */
                return put_user(fd->fd_flags, (int *)arg);
        case LL_IOC_SETFLAGS:
        case LL_IOC_CLRFLAGS:
                /* Set or clear specific file flags */
                /* XXX This probably needs checks to ensure the flags are
                 * not abused, and to handle any flag side effects.
                if (get_user(flags, (int *) arg))

                if (cmd == LL_IOC_SETFLAGS) {
                        /* disabling locking is only sane when the app does
                         * its own synchronization, hence O_DIRECT required */
                        if ((flags & LL_FILE_IGNORE_LOCK) &&
                            !(file->f_flags & O_DIRECT)) {
                                CERROR("%s: unable to disable locking on "
                                       "non-O_DIRECT file\n", current->comm);

                        fd->fd_flags |= flags;
                        /* LL_IOC_CLRFLAGS clears the given bits */
                        fd->fd_flags &= ~flags;
        case LL_IOC_LOV_SETSTRIPE:
                RETURN(ll_lov_setstripe(inode, file, arg));
        case LL_IOC_LOV_SETEA:
                RETURN(ll_lov_setea(inode, file, arg));
        case LL_IOC_LOV_GETSTRIPE:
                RETURN(ll_lov_getstripe(inode, arg));
        case LL_IOC_RECREATE_OBJ:
                RETURN(ll_lov_recreate_obj(inode, file, arg));
        case EXT3_IOC_GETFLAGS:
        case EXT3_IOC_SETFLAGS:
                RETURN(ll_iocontrol(inode, file, cmd, arg));
        case EXT3_IOC_GETVERSION_OLD:
        case EXT3_IOC_GETVERSION:
                RETURN(put_user(inode->i_generation, (int *)arg));

                /* file-join: arg is the userspace path of the tail file */
                ftail = getname((const char *)arg);
                        RETURN(PTR_ERR(ftail));
                rc = ll_file_join(inode, file, ftail);

        case LL_IOC_GROUP_LOCK:
                RETURN(ll_get_grouplock(inode, file, arg));
        case LL_IOC_GROUP_UNLOCK:
                RETURN(ll_put_grouplock(inode, file, arg));
        case IOC_OBD_STATFS:
                RETURN(ll_obd_statfs(inode, (void *)arg));

        /* We need to special case any other ioctls we want to handle,
         * to send them to the MDS/OST as appropriate and to properly
         * network encode the arg field.
        case EXT3_IOC_SETVERSION_OLD:
        case EXT3_IOC_SETVERSION:

        case LL_IOC_FLUSHCTX:
                RETURN(ll_flush_ctx(inode));

                /* offer unrecognized cmds to registered dynamic handlers */
                    ll_iocontrol_call(inode, file, cmd, arg, &err))

                /* last resort: pass the ioctl down to the data export */
                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek implementation.  For SEEK_END the cluster-wide file size must
 * first be refreshed with a glimpse lock (ll_glimpse_size) before
 * i_size_read() can be trusted; the size is then read under the inode
 * size lock.  The final offset is validated against ll_file_maxbytes().
 */
loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;

        /* provisional target, for the trace message only */
        retval = offset + ((origin == 2) ? i_size_read(inode) :
                           (origin == 1) ? file->f_pos : 0);
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
               inode->i_ino, inode->i_generation, inode, retval, retval,
               origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);

        if (origin == 2) { /* SEEK_END */
                int nonblock = 0, rc;

                /* O_NONBLOCK seeks must not block waiting for DLM locks */
                if (file->f_flags & O_NONBLOCK)
                        nonblock = LDLM_FL_BLOCK_NOWAIT;

                rc = ll_glimpse_size(inode, nonblock);

                ll_inode_size_lock(inode, 0);
                offset += i_size_read(inode);
                ll_inode_size_unlock(inode, 0);
        } else if (origin == 1) { /* SEEK_CUR */
                offset += file->f_pos;

        if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                        /* 2.4 kernels track a global f_version event counter */
                        file->f_version = ++event;
/*
 * fsync: wait for in-flight page I/O, surface any async writeback
 * errors recorded on the inode/stripes, then sync the metadata through
 * the MDC (md_sync) and the data objects through the OSC (obd_sync).
 */
int ll_fsync(struct file *file, struct dentry *dentry, int data)
        struct inode *inode = dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        struct ptlrpc_request *req;
        struct obd_capa *oc;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
               inode->i_generation, inode);
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

        /* fsync's caller has already called _fdata{sync,write}, we want
         * that IO to finish before calling the osc and mdc sync methods */
        rc = filemap_fdatawait(inode->i_mapping);

        /* catch async errors that were recorded back when async writeback
         * failed for pages in this mapping. */
        err = lli->lli_async_rc;
        lli->lli_async_rc = 0;
        /* also collect (and reset) per-stripe async errors */
        err = lov_test_and_clear_async_rc(lsm);

        /* metadata sync through the MDS, with capability if enabled */
        oc = ll_mdscapa_get(inode);
        err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
        ptlrpc_req_finished(req);

        /* NOTE(review): -ENOMEM here presumably reflects a failed obdo
         * allocation on the data-sync path — confirm against full source */
        RETURN(rc ? rc : -ENOMEM);

        oa->o_id = lsm->lsm_object_id;
        oa->o_gr = lsm->lsm_object_gr;
        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
                        OBD_MD_FLMTIME | OBD_MD_FLCTIME |

        /* data sync over the whole object range [0, EOF] */
        oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
        err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
                       0, OBD_OBJECT_EOF, oc);
/*
 * Advisory locking (fcntl and flock) backed by the DLM: translate the
 * kernel's struct file_lock into an LDLM_FLOCK enqueue on the file's
 * FID resource, then mirror the result into the local lock tables so
 * the VFS bookkeeping stays consistent.
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        /* flock locks live on a resource named by the file's FID */
        struct ldlm_res_id res_id =
                { .name = { fid_seq(ll_inode2fid(inode)),
                            fid_oid(ll_inode2fid(inode)),
                            fid_ver(ll_inode2fid(inode)),
        struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
                ldlm_flock_completion_ast, NULL, file_lock };
        struct lustre_handle lockh = {0};
        ldlm_policy_data_t flock;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
               inode->i_ino, file_lock);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

        if (file_lock->fl_flags & FL_FLOCK) {
                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
                /* set missing params for flock() calls */
                file_lock->fl_end = OFFSET_MAX;
                file_lock->fl_pid = current->tgid;
        flock.l_flock.pid = file_lock->fl_pid;
        flock.l_flock.start = file_lock->fl_start;
        flock.l_flock.end = file_lock->fl_end;

        switch (file_lock->fl_type) {
                einfo.ei_mode = LCK_PR;  /* read lock -> protected read */
                /* An unlock request may or may not have any relation to
                 * existing locks so we may not be able to pass a lock handle
                 * via a normal ldlm_lock_cancel() request. The request may even
                 * unlock a byte range in the middle of an existing lock. In
                 * order to process an unlock request we need all of the same
                 * information that is given with a normal read or write record
                 * lock request. To avoid creating another ldlm unlock (cancel)
                 * message we'll treat a LCK_NL flock request as an unlock. */
                einfo.ei_mode = LCK_NL;
                einfo.ei_mode = LCK_PW;  /* write lock -> protected write */
                CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);

                /* non-blocking request variants must not wait in the DLM */
                flags = LDLM_FL_BLOCK_NOWAIT;
                /* F_GETLK-style probe: test only, grant nothing */
                flags = LDLM_FL_TEST_LOCK;
                /* Save the old mode so that if the mode in the lock changes we
                 * can decrement the appropriate reader or writer refcount. */
                file_lock->fl_type = einfo.ei_mode;
                CERROR("unknown fcntl lock command: %d\n", cmd);

        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
               "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
               flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

        rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
                              &flock, &flags, NULL, 0, NULL, &lockh, 0);
        /* mirror a successful (or unlocking) flock into the local table */
        if ((file_lock->fl_flags & FL_FLOCK) &&
            (rc == 0 || file_lock->fl_type == F_UNLCK))
                ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
#ifdef HAVE_F_OP_FLOCK
        /* likewise for POSIX record locks, except pure test requests */
        if ((file_lock->fl_flags & FL_POSIX) &&
            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
            !(flags & LDLM_FL_TEST_LOCK))
                posix_lock_file_wait(file, file_lock);
/*
 * Lock entry point for "-o noflock" mounts: advisory locking is
 * disabled (per the ll_file_operations_noflock comment, flock calls
 * are rejected with ENOSYS — confirm against the full body).
 */
int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test whether this client already holds an MDS inodebits lock covering
 * @bits on @inode.  LDLM_FL_TEST_LOCK makes md_lock_match() a pure
 * probe: no reference is taken on any matched lock.
 */
int ll_have_md_lock(struct inode *inode, __u64 bits)
        struct lustre_handle lockh;
        ldlm_policy_data_t policy = { .l_inodebits = {bits}};

        fid = &ll_i2info(inode)->lli_fid;
        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));

        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
        if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
                          LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Like ll_have_md_lock() but without LDLM_FL_TEST_LOCK: a matched lock
 * is actually referenced, its handle returned via @lockh, and the
 * matched mode returned to the caller (who must release it).
 */
ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
                            struct lustre_handle *lockh)
        ldlm_policy_data_t policy = { .l_inodebits = {bits}};

        fid = &ll_i2info(inode)->lli_fid;
        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));

        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
        rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Common post-processing for revalidation RPC results: map -ENOENT on
 * an already-unlinked inode to success (the dentry handling is done by
 * the caller); any other failure is logged and propagated.
 */
static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
        if (rc == -ENOENT) { /* Already unlinked. Just update nlink
                              * and return success */
                /* This path cannot be hit for regular files unless in
                 * case of obscure races, so no need to to validate
                if (!S_ISREG(inode->i_mode) &&
                    !S_ISDIR(inode->i_mode))
                CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate an inode's attributes against the MDS.  Two strategies:
 *  - if the server supports OBD_CONNECT_ATTRFID, do an intent getattr
 *    by FID (md_intent_lock with O_CHECK_STALE), which also refreshes
 *    the dcache state via ll_revalidate_it_finish();
 *  - otherwise, only if no UPDATE|LOOKUP MDS lock is already held, do
 *    a plain md_getattr and rebuild the inode via ll_prep_inode().
 * Finally the file size is refreshed with a glimpse lock.
 */
int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
        struct inode *inode = dentry->d_inode;
        struct ptlrpc_request *req = NULL;
        struct ll_sb_info *sbi;
        struct obd_export *exp;

        /* dentry without an inode here is unexpected */
        CERROR("REPORT THIS LINE TO PETER\n");
        sbi = ll_i2sbi(inode);

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
               inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

        exp = ll_i2mdexp(inode);

        if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
                struct lookup_intent oit = { .it_op = IT_GETATTR };
                struct md_op_data *op_data;

                /* Call getattr by fid, so do not provide name at all. */
                op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
                                             dentry->d_inode, NULL, 0, 0,
                                             LUSTRE_OPC_ANY, NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                /* ask the MDS to verify the inode is not stale */
                oit.it_flags |= O_CHECK_STALE;
                rc = md_intent_lock(exp, op_data, NULL, 0,
                                    /* we are not interested in name
                                    ll_md_blocking_ast, 0);
                ll_finish_md_op_data(op_data);
                oit.it_flags &= ~O_CHECK_STALE;

                rc = ll_inode_revalidate_fini(inode, rc);

                rc = ll_revalidate_it_finish(req, &oit, dentry);
                        ll_intent_release(&oit);

                /* Unlinked? Unhash dentry, so it is not picked up later by
                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
                   here to preserve get_cwd functionality on 2.6.
                if (!dentry->d_inode->i_nlink) {
                        spin_lock(&dcache_lock);
                        ll_drop_dentry(dentry);
                        spin_unlock(&dcache_lock);

                ll_lookup_finish_locks(&oit, dentry);
        } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
                                    MDS_INODELOCK_LOOKUP)) {
                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
                obd_valid valid = OBD_MD_FLGETATTR;
                struct obd_capa *oc;

                /* regular files also need their EA (striping) size */
                if (S_ISREG(inode->i_mode)) {
                        rc = ll_get_max_mdsize(sbi, &ealen);
                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;

                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
                 * capa for this inode. Because we only keep capas of dirs
                oc = ll_mdscapa_get(inode);
                rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,

                        rc = ll_inode_revalidate_fini(inode, rc);

                rc = ll_prep_inode(&inode, req, NULL);

        /* if object not yet allocated, don't validate size */
        if (ll_i2info(inode)->lli_smd == NULL)

        /* ll_glimpse_size will prefer locally cached writes if they extend
        rc = ll_glimpse_size(inode, 0);

        ptlrpc_req_finished(req);
/*
 * getattr helper: revalidate the inode through the supplied intent and
 * then copy the now-current inode attributes into *stat.  Size and
 * block count are read under the inode size lock for consistency.
 */
int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                  struct lookup_intent *it, struct kstat *stat)
        struct inode *inode = de->d_inode;

        res = ll_inode_revalidate_it(de, it);
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);

        stat->dev = inode->i_sb->s_dev;
        stat->ino = inode->i_ino;
        stat->mode = inode->i_mode;
        stat->nlink = inode->i_nlink;
        stat->uid = inode->i_uid;
        stat->gid = inode->i_gid;
        stat->rdev = kdev_t_to_nr(inode->i_rdev);
        stat->atime = inode->i_atime;
        stat->mtime = inode->i_mtime;
        stat->ctime = inode->i_ctime;
#ifdef HAVE_INODE_BLKSIZE
        stat->blksize = inode->i_blksize;
        /* kernels without i_blksize derive it from i_blkbits */
        stat->blksize = 1 << inode->i_blkbits;

        ll_inode_size_lock(inode, 0);
        stat->size = i_size_read(inode);
        stat->blocks = inode->i_blocks;
        ll_inode_size_unlock(inode, 0);
/* VFS ->getattr: wrap ll_getattr_it() with a fresh IT_GETATTR intent. */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
        struct lookup_intent it = { .it_op = IT_GETATTR };

        return ll_getattr_it(mnt, de, &it, stat);
/*
 * Check @mask against the POSIX ACL cached on the inode
 * (lli_posix_acl).  Compiled to a stub without CONFIG_FS_POSIX_ACL.
 */
int lustre_check_acl(struct inode *inode, int mask)
#ifdef CONFIG_FS_POSIX_ACL
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl;

        /* take our own reference under lli_lock so the cached ACL cannot
         * be replaced/freed while posix_acl_permission() examines it */
        spin_lock(&lli->lli_lock);
        acl = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        rc = posix_acl_permission(inode, acl, mask);
        posix_acl_release(acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * ->permission for 2.6.10+ kernels: remote-client mounts check with the
 * server (lustre_check_remote_perm); everyone else goes through the
 * kernel's generic_permission() with lustre_check_acl as ACL callback.
 */
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
               inode->i_ino, inode->i_generation, inode, mask);
        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
                return lustre_check_remote_perm(inode, mask);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
        return generic_permission(inode, mask, lustre_check_acl);
/*
 * ->permission for pre-2.6.10 kernels: open-coded equivalent of
 * generic_permission() — owner/group/other mode bits, ACL fallback,
 * then CAP_DAC_OVERRIDE / CAP_DAC_READ_SEARCH capability checks.
 */
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
        int mode = inode->i_mode;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
               inode->i_ino, inode->i_generation, inode, mask);

        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
                return lustre_check_remote_perm(inode, mask);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);

        /* writes to read-only fs objects are refused outright */
        if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
        if (current->fsuid == inode->i_uid) {
                /* not the owner: try group bits, then the ACL */
                if (((mode >> 3) & mask & S_IRWXO) != mask)
                        rc = lustre_check_acl(inode, mask);
                        goto check_capabilities;

        if (in_group_p(inode->i_gid))
        if ((mode & mask & S_IRWXO) == mask)

        /* capability overrides, mirroring the kernel's DAC rules */
        if (!(mask & MAY_EXEC) ||
            (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
                if (capable(CAP_DAC_OVERRIDE))
        if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
            (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock methods, so flock/fcntl
 * locking falls back to the kernel's local-only implementation. */
struct file_operations ll_file_operations = {
        .read = ll_file_read,
        .write = ll_file_write,
        .ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .sendfile = ll_file_sendfile,
/* file_operations for mounts with cluster-coherent locking: flock and
 * POSIX lock requests go through the DLM via ll_file_flock(). */
struct file_operations ll_file_operations_flock = {
        .read = ll_file_read,
        .write = ll_file_write,
        .ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .sendfile = ll_file_sendfile,
#ifdef HAVE_F_OP_FLOCK
        .flock = ll_file_flock,
        .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
        .read = ll_file_read,
        .write = ll_file_write,
        .ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .sendfile = ll_file_sendfile,
#ifdef HAVE_F_OP_FLOCK
        .flock = ll_file_noflock,
        .lock = ll_file_noflock
/* inode_operations for regular files: attribute get/set, permission
 * checking and the xattr family.  setattr_raw is only used with the
 * vfs-intent-patched kernels. */
struct inode_operations ll_file_inode_operations = {
#ifdef HAVE_VFS_INTENT_PATCHES
        .setattr_raw = ll_setattr_raw,
        .setattr = ll_setattr,
        .truncate = ll_truncate,
        .getattr = ll_getattr,
        .permission = ll_inode_permission,
        .setxattr = ll_setxattr,
        .getxattr = ll_getxattr,
        .listxattr = ll_listxattr,
        .removexattr = ll_removexattr,
/* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers: a list of
 * llioc_data descriptors protected by a rw_semaphore. */
static struct llioc_ctl_data {
        struct rw_semaphore ioc_sem;   /* guards ioc_head */
        struct list_head ioc_head;     /* list of struct llioc_data */
        __RWSEM_INITIALIZER(llioc.ioc_sem),
        CFS_LIST_HEAD_INIT(llioc.ioc_head)

        /* one registered handler and the ioctl numbers it services */
        struct list_head iocd_list;    /* linkage on llioc.ioc_head */
        unsigned int iocd_size;        /* total allocation size, for OBD_FREE */
        llioc_callback_t iocd_cb;      /* handler callback */
        unsigned int iocd_count;       /* number of entries in iocd_cmd[] */
        unsigned int iocd_cmd[0];      /* trailing array of ioctl numbers */
/*
 * Register a dynamic ioctl handler: @cb services the @count ioctl
 * numbers in @cmd.  Returns an opaque cookie (the descriptor itself)
 * to pass to ll_iocontrol_unregister() later, or NULL on bad args /
 * allocation failure.
 */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
        struct llioc_data *in_data = NULL;

        if (cb == NULL || cmd == NULL ||
            count > LLIOC_MAX_CMD || count < 0)

        /* descriptor plus trailing iocd_cmd[] array in one allocation */
        size = sizeof(*in_data) + count * sizeof(unsigned int);
        OBD_ALLOC(in_data, size);
        if (in_data == NULL)

        memset(in_data, 0, sizeof(*in_data));
        in_data->iocd_size = size;
        in_data->iocd_cb = cb;
        in_data->iocd_count = count;
        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);

        down_write(&llioc.ioc_sem);
        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
        up_write(&llioc.ioc_sem);
/*
 * Unregister a handler previously returned by ll_iocontrol_register():
 * find the matching descriptor under the write lock, unlink and free
 * it.  A cookie that is not found only produces a warning.
 */
void ll_iocontrol_unregister(void *magic)
        struct llioc_data *tmp;

        down_write(&llioc.ioc_sem);
        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
                /* remember the size before the entry is freed */
                unsigned int size = tmp->iocd_size;

                list_del(&tmp->iocd_list);
                up_write(&llioc.ioc_sem);

                OBD_FREE(tmp, size);

        up_write(&llioc.ioc_sem);

        CWARN("didn't find iocontrol register block with magic: %p\n", magic);

EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3232 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3233 unsigned int cmd, unsigned long arg, int *rcp)
3235 enum llioc_iter ret = LLIOC_CONT;
3236 struct llioc_data *data;
3237 int rc = -EINVAL, i;
3239 down_read(&llioc.ioc_sem);
3240 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3241 for (i = 0; i < data->iocd_count; i++) {
3242 if (cmd != data->iocd_cmd[i])
3245 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3249 if (ret == LLIOC_STOP)
3252 up_read(&llioc.ioc_sem);