1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from its dedicated slab cache.
 * NOTE(review): this listing is missing lines — the NULL-check/return of
 * the freshly allocated fd is presumably below; confirm in full source. */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return an ll_file_data to the slab cache.
 * NOTE(review): a guard against a NULL fd appears to have been dropped
 * from this listing — confirm before assuming fd may be NULL here. */
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Copy the inode's current attributes (mode, times, size, blocks, flags),
 * its FID, IO epoch, the given open file handle @fh, and an MDS capability
 * into @op_data, ready for an MDS close/setattr RPC. */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives past the end of the kernel iattr; reached via the
 * ll_iattr wrapper cast. */
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
/* op_handle identifies which MDS open handle this operation refers to. */
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
61 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for closing open handle @och: select which attributes
 * are valid for the close RPC and pack the inode state into op_data. */
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
72 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client
 * also sends size/blocks on close. */
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* ll_epoch_close() takes &och (pointer-to-pointer): it may consume the
 * handle. NOTE(review): a NULL-check on och before the pack call appears
 * to be missing from this listing — confirm in full source. */
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Close an MDS open handle: build the close op_data, send md_close(),
 * and, when the MDS requests it, push a Size-on-MDS update gathered from
 * the OSTs. Cleans up replay data and poisons the handle cookie.
 * NOTE(review): many lines (locals, braces, RETURNs, error branches) are
 * absent from this listing; comments below describe only what is visible. */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
/* Remember whether this close ends the IO epoch before op_data is freed. */
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr to back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM connections with an unfinished epoch on a written regular file
 * must queue a DONE_WRITING to finish the epoch later. */
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
/* Drop one reference on the MDS open handle matching @flags
 * (write/exec/read) and, when no users remain, actually close the handle
 * via ll_close_inode_openhandle(). Protected by lli_och_sem. */
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its use count. */
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open count under lli_och_sem, and close the MDS handle only when no
 * matching OPEN DLM lock lets us skip the RPC. Finally frees the fd and
 * releases the capability. NOTE(review): lines are missing from this
 * listing (lockmode assignment, several closing braces, RETURN). */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
/* TEST_LOCK match: no matching OPEN ibits lock cached -> must tell MDS. */
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
/* VFS ->release() hook. Clears remote-ACL state for the root inode on
 * remote clients, tallies stats, short-circuits for "/", clears stripe
 * async rc, then performs the real MDS close via ll_md_close(). */
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
289 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping applies only to the filesystem root. */
290 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
291 inode == inode->i_sb->s_root->d_inode) {
292 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
295 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
296 fd->fd_flags &= ~LL_FILE_RMTACL;
297 rct_del(&sbi->ll_rct, cfs_curproc_pid());
298 et_search_free(&sbi->ll_et, cfs_curproc_pid());
303 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
304 fd = LUSTRE_FPRIVATE(file);
307 /* don't do anything for / */
308 if (inode->i_sb->s_root == file->f_dentry) {
309 LUSTRE_FPRIVATE(file) = NULL;
310 ll_file_data_put(fd);
/* Drop any stored async write error before close; it was presumably
 * propagated elsewhere (lines missing from this listing). */
315 lov_test_and_clear_async_rc(lsm);
316 lli->lli_async_rc = 0;
318 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Send an IT_OPEN intent to the MDS for @file (used by NFSD / patchless
 * kernels and for re-opens after a lost cached handle). Optionally packs
 * striping info @lmm/@lmmsize; requests an OPEN DLM lock when no striping
 * is being set. On success attaches lock data and refreshes the inode. */
322 static int ll_intent_file_open(struct file *file, void *lmm,
323 int lmmsize, struct lookup_intent *itp)
325 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
326 struct dentry *parent = file->f_dentry->d_parent;
327 const char *name = file->f_dentry->d_name.name;
328 const int len = file->f_dentry->d_name.len;
329 struct md_op_data *op_data;
330 struct ptlrpc_request *req;
336 /* Usually we come here only for NFSD, and we want open lock.
337 But we can also get here with pre 2.6.15 patchless kernels, and in
338 that case that lock is also ok */
339 /* We can also get here if there was cached open handle in revalidate_it
340 * but it disappeared while we were getting from there to ll_file_open.
341 * But this means this file was closed and immediatelly opened which
342 * makes a good candidate for using OPEN lock */
343 /* If lmmsize & lmm are not 0, we are just setting stripe info
344 * parameters. No need for the open lock */
345 if (!lmm && !lmmsize)
346 itp->it_flags |= MDS_OPEN_LOCK;
348 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
349 file->f_dentry->d_inode, name, len,
350 O_RDWR, LUSTRE_OPC_ANY, NULL);
352 RETURN(PTR_ERR(op_data));
354 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
355 0 /*unused */, &req, ll_md_blocking_ast, 0);
356 ll_finish_md_op_data(op_data);
358 /* reason for keep own exit path - don`t flood log
359 * with messages with -ESTALE errors.
/* Open granted but unusable (or error recorded): release the handle so
 * the MDS does not keep it pinned. */
361 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
362 it_open_error(DISP_OPEN_OPEN, itp))
364 ll_release_openhandle(file->f_dentry, itp);
368 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
369 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
370 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
374 if (itp->d.lustre.it_lock_mode)
375 md_set_lock_data(sbi->ll_md_exp,
376 &itp->d.lustre.it_lock_handle,
377 file->f_dentry->d_inode);
379 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
382 ptlrpc_req_finished(itp->d.lustre.it_data);
385 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
386 ll_intent_drop_lock(itp);
/* Populate @och from the MDS open reply carried in @it: copy the server
 * file handle, record fid/flags, pick up the IO epoch, and register the
 * open for replay with the MDC. Returns md_set_open_replay_data()'s rc. */
391 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
392 struct lookup_intent *it, struct obd_client_handle *och)
394 struct ptlrpc_request *req = it->d.lustre.it_data;
395 struct mdt_body *body;
399 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
400 /* reply already checked out */
401 LASSERT(body != NULL);
402 /* and swabbed in md_enqueue */
403 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
405 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
406 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
407 och->och_fid = lli->lli_fid;
408 och->och_flags = it->it_flags;
/* The server-assigned IO epoch for this open is cached on the inode. */
409 lli->lli_ioepoch = body->ioepoch;
411 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-side part of an open: fill @och from the intent
 * reply (when provided), then attach @fd as the file's private data and
 * initialise readahead state. NOTE(review): lines missing around the
 * FMODE_WRITE/OBD_MD_FLSIZE check — its consequent is not visible here. */
414 int ll_local_open(struct file *file, struct lookup_intent *it,
415 struct ll_file_data *fd, struct obd_client_handle *och)
417 struct inode *inode = file->f_dentry->d_inode;
418 struct ll_inode_info *lli = ll_i2info(inode);
/* ll_file_open() guarantees private data is not set twice. */
421 LASSERT(!LUSTRE_FPRIVATE(file));
426 struct ptlrpc_request *req = it->d.lustre.it_data;
427 struct mdt_body *body;
430 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
434 body = lustre_msg_buf(req->rq_repmsg,
435 DLM_REPLY_REC_OFF, sizeof(*body));
437 if ((it->it_flags & FMODE_WRITE) &&
438 (body->valid & OBD_MD_FLSIZE))
440 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
441 lli->lli_ioepoch, PFID(&lli->lli_fid));
445 LUSTRE_FPRIVATE(file) = fd;
446 ll_readahead_init(inode, &fd->fd_ras);
/* Remember the open mode for later per-mode handle accounting. */
447 fd->fd_omode = it->it_flags;
451 /* Open a file, and (for the very first open) create objects on the OSTs at
452 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
453 * creation or open until ll_lov_setstripe() ioctl is called. We grab
454 * lli_open_sem to ensure no other process will create objects, send the
455 * stripe MD to the MDS, or try to destroy the objects if that fails.
457 * If we already have the stripe MD locally then we don't request it in
458 * md_open(), by passing a lmm_size = 0.
460 * It is up to the application to ensure no other processes open this file
461 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
462 * used. We might be able to avoid races of that sort by getting lli_open_sem
463 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
464 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
466 int ll_file_open(struct inode *inode, struct file *file)
468 struct ll_inode_info *lli = ll_i2info(inode);
469 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
470 .it_flags = file->f_flags };
471 struct lov_stripe_md *lsm;
472 struct ptlrpc_request *req = NULL;
473 struct obd_client_handle **och_p;
475 struct ll_file_data *fd;
479 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
480 inode->i_generation, inode, file->f_flags);
482 #ifdef HAVE_VFS_INTENT_PATCHES
485 it = file->private_data; /* XXX: compat macro */
486 file->private_data = NULL; /* prevent ll_local_open assertion */
489 fd = ll_file_data_get();
493 /* don't do anything for / */
494 if (inode->i_sb->s_root == file->f_dentry) {
495 LUSTRE_FPRIVATE(file) = fd;
499 if (!it || !it->d.lustre.it_disposition) {
500 /* Convert f_flags into access mode. We cannot use file->f_mode,
501 * because everything but O_ACCMODE mask was stripped from
503 if ((oit.it_flags + 1) & O_ACCMODE)
505 if (file->f_flags & O_TRUNC)
506 oit.it_flags |= FMODE_WRITE;
508 /* kernel only call f_op->open in dentry_open. filp_open calls
509 * dentry_open after call to open_namei that checks permissions.
510 * Only nfsd_open call dentry_open directly without checking
511 * permissions and because of that this code below is safe. */
512 if (oit.it_flags & FMODE_WRITE)
513 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
515 /* We do not want O_EXCL here, presumably we opened the file
516 * already? XXX - NFS implications? */
517 oit.it_flags &= ~O_EXCL;
523 /* Let's see if we have file open on MDS already. */
524 if (it->it_flags & FMODE_WRITE) {
525 och_p = &lli->lli_mds_write_och;
526 och_usecount = &lli->lli_open_fd_write_count;
527 } else if (it->it_flags & FMODE_EXEC) {
528 och_p = &lli->lli_mds_exec_och;
529 och_usecount = &lli->lli_open_fd_exec_count;
531 och_p = &lli->lli_mds_read_och;
532 och_usecount = &lli->lli_open_fd_read_count;
535 down(&lli->lli_och_sem);
536 if (*och_p) { /* Open handle is present */
537 if (it_disposition(it, DISP_OPEN_OPEN)) {
538 /* Well, there's extra open request that we do not need,
539 let's close it somehow. This will decref request. */
540 rc = it_open_error(DISP_OPEN_OPEN, it);
542 ll_file_data_put(fd);
543 GOTO(out_och_free, rc);
545 ll_release_openhandle(file->f_dentry, it);
546 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
551 rc = ll_local_open(file, it, fd, NULL);
553 up(&lli->lli_och_sem);
554 ll_file_data_put(fd);
558 LASSERT(*och_usecount == 0);
559 if (!it->d.lustre.it_disposition) {
560 /* We cannot just request lock handle now, new ELC code
561 means that one of other OPEN locks for this file
562 could be cancelled, and since blocking ast handler
563 would attempt to grab och_sem as well, that would
564 result in a deadlock */
565 up(&lli->lli_och_sem);
566 it->it_flags |= O_CHECK_STALE;
567 rc = ll_intent_file_open(file, NULL, 0, it);
568 it->it_flags &= ~O_CHECK_STALE;
570 ll_file_data_put(fd);
571 GOTO(out_openerr, rc);
574 /* Got some error? Release the request */
575 if (it->d.lustre.it_status < 0) {
576 req = it->d.lustre.it_data;
577 ptlrpc_req_finished(req);
579 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
580 &it->d.lustre.it_lock_handle,
581 file->f_dentry->d_inode);
584 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
586 ll_file_data_put(fd);
587 GOTO(out_och_free, rc = -ENOMEM);
590 req = it->d.lustre.it_data;
592 /* md_intent_lock() didn't get a request ref if there was an
593 * open error, so don't do cleanup on the request here
595 /* XXX (green): Should not we bail out on any error here, not
596 * just open error? */
597 rc = it_open_error(DISP_OPEN_OPEN, it);
599 ll_file_data_put(fd);
600 GOTO(out_och_free, rc);
603 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
604 rc = ll_local_open(file, it, fd, *och_p);
606 up(&lli->lli_och_sem);
607 ll_file_data_put(fd);
608 GOTO(out_och_free, rc);
611 up(&lli->lli_och_sem);
613 /* Must do this outside lli_och_sem lock to prevent deadlock where
614 different kind of OPEN lock for this same inode gets cancelled
615 by ldlm_cancel_lru */
616 if (!S_ISREG(inode->i_mode))
623 if (file->f_flags & O_LOV_DELAY_CREATE ||
624 !(file->f_mode & FMODE_WRITE)) {
625 CDEBUG(D_INODE, "object creation was delayed\n");
629 file->f_flags &= ~O_LOV_DELAY_CREATE;
632 ptlrpc_req_finished(req);
634 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
638 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
639 *och_p = NULL; /* OBD_FREE writes some magic there */
642 up(&lli->lli_och_sem);
643 out_openerr: ;/* Looks weierd, eh? Just wait for statahead code to insert
644 a statement here <-- remove this comment after statahead
651 /* Fills the obdo with the attributes for the inode defined by lsm */
652 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
654 struct ptlrpc_request_set *set;
655 struct ll_inode_info *lli = ll_i2info(inode);
656 struct lov_stripe_md *lsm = lli->lli_smd;
658 struct obd_info oinfo = { { { 0 } } };
662 LASSERT(lsm != NULL);
666 oinfo.oi_oa->o_id = lsm->lsm_object_id;
667 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
668 oinfo.oi_oa->o_mode = S_IFREG;
669 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
670 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
671 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
672 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
674 oinfo.oi_capa = ll_mdscapa_get(inode);
676 set = ptlrpc_prep_set();
678 CERROR("can't allocate ptlrpc set\n");
681 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
683 rc = ptlrpc_set_wait(set);
684 ptlrpc_set_destroy(set);
686 capa_put(oinfo.oi_capa);
690 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
691 OBD_MD_FLATIME | OBD_MD_FLMTIME |
692 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
694 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
695 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
696 lli->lli_smd->lsm_object_id, i_size_read(inode),
697 inode->i_blocks, inode->i_blksize);
/* Strip setuid (and setgid-with-group-exec) bits from the inode's mode on
 * write by an unprivileged caller, mirroring the kernel's remove_suid(). */
701 static inline void ll_remove_suid(struct inode *inode)
705 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
706 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
708 /* was any of the uid bits set? */
709 mode &= inode->i_mode;
/* CAP_FSETID holders may keep the bits; everyone else loses them. */
710 if (mode && !capable(CAP_FSETID)) {
711 inode->i_mode &= ~mode;
712 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the stripe index it covers within the
 * inode's LOV stripe_md, via an obd_get_info("lock_to_stripe") query, and
 * sanity-check the lock's resource against that stripe's object id/group.
 * Returns the stripe index on success (per the visible check path). */
716 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
718 struct ll_inode_info *lli = ll_i2info(inode);
719 struct lov_stripe_md *lsm = lli->lli_smd;
720 struct obd_export *exp = ll_i2dtexp(inode);
/* Anonymous key struct consumed by the LOV's get_info handler. */
723 struct ldlm_lock *lock;
724 struct lov_stripe_md *lsm;
725 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
726 __u32 stripe, vallen = sizeof(stripe);
/* Single-stripe files trivially map to stripe 0. */
730 if (lsm->lsm_stripe_count == 1)
731 GOTO(check, stripe = 0);
733 /* get our offset in the lov */
734 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
736 CERROR("obd_get_info: rc = %d\n", rc);
739 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name[0]/name[2] carry object id/group; mismatch means the
 * lock does not belong to this stripe's object. */
742 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
743 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
744 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
745 lsm->lsm_oinfo[stripe]->loi_id,
746 lsm->lsm_oinfo[stripe]->loi_gr);
747 RETURN(-ELDLM_NO_LOCK_DATA);
753 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
754 * we get a lock cancellation for each stripe, so we have to map the obd's
755 * region back onto the stripes in the file that it held.
757 * No one can dirty the extent until we've finished our work and they can
758 * enqueue another lock. The DLM protects us from ll_file_read/write here,
759 * but other kernel actors could have pages locked.
761 * Called with the DLM lock held. */
/* Evict (write back or discard) all page-cache pages covered by a
 * cancelled extent @lock on stripe @stripe: tear down mmapped ranges,
 * then walk the stripe-mapped page indices, writing dirty pages unless
 * LDLM_FL_DISCARD_DATA is set, and truncating pages not protected by
 * another DLM lock. NOTE(review): several lines (loop setup, rc checks,
 * braces) are absent from this listing. */
762 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
763 struct ldlm_lock *lock, __u32 stripe)
765 ldlm_policy_data_t tmpex;
766 unsigned long start, end, count, skip, i, j;
768 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
769 struct lustre_handle lockh;
770 struct address_space *mapping = inode->i_mapping;
773 tmpex = lock->l_policy_data;
774 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
775 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
778 /* our locks are page granular thanks to osc_enqueue, we invalidate the
780 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
781 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
782 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
784 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
785 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* Convert the stripe-local byte extent into file-wide page indices,
 * accounting for the LOV round-robin striping layout. */
789 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
790 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
791 if (lsm->lsm_stripe_count > 1) {
792 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
793 skip = (lsm->lsm_stripe_count - 1) * count;
794 start += start/count * skip + stripe * count;
796 end += end/count * skip + stripe * count;
798 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
801 i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
806 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
807 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
808 count, skip, end, discard ? " (DISCARDING)" : "");
810 /* walk through the vmas on the inode and tear down mmaped pages that
811 * intersect with the lock. this stops immediately if there are no
812 * mmap()ed regions of the file. This is not efficient at all and
813 * should be short lived. We'll associate mmap()ed pages with the lock
814 * and will be able to find them directly */
815 for (i = start; i <= end; i += (j + skip)) {
816 j = min(count - (i % count), end - i + 1);
819 if (ll_teardown_mmaps(mapping,
820 (__u64)i << CFS_PAGE_SHIFT,
821 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
825 /* this is the simplistic implementation of page eviction at
826 * cancelation. It is careful to get races with other page
827 * lockers handled correctly. fixes from bug 20 will make it
828 * more efficient by associating locks with pages and with
829 * batching writeback under the lock explicitly. */
830 for (i = start, j = start % count; i <= end;
831 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
833 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
839 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
840 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
841 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
844 if (!mapping_has_pages(mapping)) {
845 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
851 page = find_get_page(mapping, i);
854 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
855 i, tmpex.l_extent.start);
858 /* page->mapping to check with racing against teardown */
859 if (!discard && clear_page_dirty_for_io(page)) {
860 rc = ll_call_writepage(inode, page);
861 /* either waiting for io to complete or reacquiring
862 * the lock that the failed writepage released */
864 wait_on_page_writeback(page);
866 CERROR("writepage inode %lu(%p) of page %p "
867 "failed: %d\n", inode->i_ino, inode,
/* Record the write failure on the mapping so a later fsync sees it. */
870 set_bit(AS_ENOSPC, &mapping->flags);
872 set_bit(AS_EIO, &mapping->flags);
876 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
877 /* check to see if another DLM lock covers this page b=2765 */
878 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
879 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
881 &lock->l_resource->lr_name, LDLM_EXTENT,
882 &tmpex, LCK_PR | LCK_PW, &lockh);
/* No other lock protects the page and it is still attached: drop it. */
884 if (rc2 <= 0 && page->mapping != NULL) {
885 struct ll_async_page *llap = llap_cast_private(page);
886 /* checking again to account for writeback's
888 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
890 ll_ra_accounting(llap, mapping);
891 ll_truncate_complete_page(page);
894 page_cache_release(page);
896 LASSERTF(tmpex.l_extent.start <=
897 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
898 lock->l_policy_data.l_extent.end + 1),
899 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
900 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM extent-lock callback. On LDLM_CB_BLOCKING, cancel our lock so the
 * competing client can proceed; on LDLM_CB_CANCELING, flush/evict covered
 * pages and shrink the cached known-minimum-size (KMS) for the stripe. */
905 static int ll_extent_lock_callback(struct ldlm_lock *lock,
906 struct ldlm_lock_desc *new, void *data,
909 struct lustre_handle lockh = { 0 };
/* Small non-NULL data values indicate a corrupted cbdata pointer. */
913 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
914 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
919 case LDLM_CB_BLOCKING:
920 ldlm_lock2handle(lock, &lockh);
921 rc = ldlm_cli_cancel(&lockh);
923 CERROR("ldlm_cli_cancel failed: %d\n", rc);
925 case LDLM_CB_CANCELING: {
927 struct ll_inode_info *lli;
928 struct lov_stripe_md *lsm;
932 /* This lock wasn't granted, don't try to evict pages */
933 if (lock->l_req_mode != lock->l_granted_mode)
936 inode = ll_inode_from_lock(lock);
939 lli = ll_i2info(inode);
942 if (lli->lli_smd == NULL)
946 stripe = ll_lock_to_stripe_offset(inode, lock);
950 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute KMS without this lock; both stripe and resource locks are
 * held while the cached value is updated. */
952 lov_stripe_lock(lsm);
953 lock_res_and_lock(lock);
954 kms = ldlm_extent_shift_kms(lock,
955 lsm->lsm_oinfo[stripe]->loi_kms);
957 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
958 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
959 lsm->lsm_oinfo[stripe]->loi_kms, kms);
960 lsm->lsm_oinfo[stripe]->loi_kms = kms;
961 unlock_res_and_lock(lock);
962 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent enqueues (e.g. glimpse).
 * Updates the stripe's rss/kms from the lock's LVB, wakes waiters, and
 * drops the enqueue's PR reference. Blocked async locks are not expected
 * (LBUG). NOTE(review): lsm_oinfo is indexed as a struct here but as a
 * pointer array elsewhere in this listing — likely version skew in the
 * excerpt; confirm against the real tree. */
975 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
977 /* XXX ALLOCATE - 160 bytes */
978 struct inode *inode = ll_inode_from_lock(lock);
979 struct ll_inode_info *lli = ll_i2info(inode);
980 struct lustre_handle lockh = { 0 };
985 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
986 LDLM_FL_BLOCK_CONV)) {
987 LBUG(); /* not expecting any blocked async locks yet */
988 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
990 ldlm_lock_dump(D_OTHER, lock, 0);
991 ldlm_reprocess_all(lock->l_resource);
995 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
997 stripe = ll_lock_to_stripe_offset(inode, lock);
/* An LVB travelled back with the reply: fold its size into rss/kms. */
1001 if (lock->l_lvb_len) {
1002 struct lov_stripe_md *lsm = lli->lli_smd;
1004 lvb = lock->l_lvb_data;
1005 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
1007 lock_res_and_lock(lock);
1008 ll_inode_size_lock(inode, 1);
1009 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
1010 kms = ldlm_extent_shift_kms(NULL, kms);
1011 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
1012 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
1013 lsm->lsm_oinfo[stripe].loi_kms, kms);
1014 lsm->lsm_oinfo[stripe].loi_kms = kms;
1015 ll_inode_size_unlock(inode, 1);
1016 unlock_res_and_lock(lock);
1021 wake_up(&lock->l_waitq);
1023 ldlm_lock2handle(lock, &lockh);
1024 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants this file's size. Reply with an LVB
 * built from our cached per-stripe KMS and the inode's timestamps.
 * -ELDLM_NO_LOCK_DATA races are normal and answered without ptlrpc_error
 * to avoid console noise. */
1029 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1031 struct ptlrpc_request *req = reqp;
1032 struct inode *inode = ll_inode_from_lock(lock);
1033 struct ll_inode_info *lli;
1034 struct lov_stripe_md *lsm;
1035 struct ost_lvb *lvb;
1037 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1041 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1042 lli = ll_i2info(inode);
1044 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1047 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1049 /* First, find out which stripe index this lock corresponds to. */
1050 stripe = ll_lock_to_stripe_offset(inode, lock);
1052 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1054 rc = lustre_pack_reply(req, 2, size, NULL);
/* Fill the LVB: our known minimum size for this stripe plus the inode's
 * current timestamps. */
1058 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1059 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1060 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1061 lvb->lvb_atime = LTIME_S(inode->i_atime);
1062 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1064 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1065 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1066 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1067 lvb->lvb_atime, lvb->lvb_ctime);
1072 /* These errors are normal races, so we don't want to fill the console
1073 * with messages by calling ptlrpc_error() */
1074 if (rc == -ELDLM_NO_LOCK_DATA)
1075 lustre_pack_reply(req, 1, NULL, NULL);
1077 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into the inode under the size
 * lock: update i_size, i_blocks, and the three timestamps from the merged
 * LVB produced by obd_merge_lvb(). */
1081 static int ll_merge_lvb(struct inode *inode)
1083 struct ll_inode_info *lli = ll_i2info(inode);
1084 struct ll_sb_info *sbi = ll_i2sbi(inode);
1090 ll_inode_size_lock(inode, 1);
/* Seed the LVB from the inode so unmerged fields keep current values. */
1091 inode_init_lvb(inode, &lvb);
1092 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1093 i_size_write(inode, lvb.lvb_size);
1094 inode->i_blocks = lvb.lvb_blocks;
1096 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1097 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1098 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1099 ll_inode_size_unlock(inode, 1);
/* Try to compute the file size purely from locally cached PR extent locks
 * covering [0, EOF]: obd_match() checks every stripe, and on success the
 * LVBs are merged and the matched reference is cancelled again. */
1104 int ll_local_size(struct inode *inode)
1106 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1107 struct ll_inode_info *lli = ll_i2info(inode);
1108 struct ll_sb_info *sbi = ll_i2sbi(inode);
1109 struct lustre_handle lockh = { 0 };
/* No objects -> nothing to match against. */
1114 if (lli->lli_smd->lsm_stripe_count == 0)
1117 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1118 &policy, LCK_PR, &flags, inode, &lockh);
1124 rc = ll_merge_lvb(inode);
1125 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse helper for ioctls operating on an explicit @lsm (not the
 * inode's own): enqueue an intent-only [*, EOF] PR extent lock, then merge
 * the returned LVBs into the caller-supplied stat buffer under the stripe
 * lock. NOTE(review): oinfo.oi_md assignment and the policy start offset
 * are not visible in this listing. */
1129 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1132 struct lustre_handle lockh = { 0 };
1133 struct ldlm_enqueue_info einfo = { 0 };
1134 struct obd_info oinfo = { { { 0 } } };
1140 einfo.ei_type = LDLM_EXTENT;
1141 einfo.ei_mode = LCK_PR;
1142 einfo.ei_cb_bl = ll_extent_lock_callback;
1143 einfo.ei_cb_cp = ldlm_completion_ast;
1144 einfo.ei_cb_gl = ll_glimpse_callback;
1145 einfo.ei_cbdata = NULL;
1147 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1148 oinfo.oi_lockh = &lockh;
/* HAS_INTENT makes this a glimpse: no conflicting locks are revoked. */
1150 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1152 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1156 CERROR("obd_enqueue returned rc %d, "
1157 "returning -EIO\n", rc);
1158 RETURN(rc > 0 ? -EIO : rc);
1161 lov_stripe_lock(lsm);
1162 memset(&lvb, 0, sizeof(lvb));
1163 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1164 st->st_size = lvb.lvb_size;
1165 st->st_blocks = lvb.lvb_blocks;
1166 st->st_mtime = lvb.lvb_mtime;
1167 st->st_atime = lvb.lvb_atime;
1168 st->st_ctime = lvb.lvb_ctime;
1169 lov_stripe_unlock(lsm);
1174 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1175 * file (because it prefers KMS over RSS when larger) */
1174 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1175 * file (because it prefers KMS over RSS when larger) */
/* Refresh the inode's size via a glimpse: an intent-only PR extent
 * enqueue triggers ll_glimpse_callback() on every lock holder, and the
 * resulting LVBs are merged into the inode. Skipped entirely when the MDS
 * size lock (LLIF_MDS_SIZE_LOCK) already guarantees the size. */
1176 int ll_glimpse_size(struct inode *inode, int ast_flags)
1178 struct ll_inode_info *lli = ll_i2info(inode);
1179 struct ll_sb_info *sbi = ll_i2sbi(inode);
1180 struct lustre_handle lockh = { 0 };
1181 struct ldlm_enqueue_info einfo = { 0 };
1182 struct obd_info oinfo = { { { 0 } } };
1186 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1189 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1191 if (!lli->lli_smd) {
1192 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1196 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1197 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1198 * won't revoke any conflicting DLM locks held. Instead,
1199 * ll_glimpse_callback() will be called on each client
1200 * holding a DLM lock against this file, and resulting size
1201 * will be returned for each stripe. DLM lock on [0, EOF] is
1202 * acquired only if there were no conflicting locks. */
1203 einfo.ei_type = LDLM_EXTENT;
1204 einfo.ei_mode = LCK_PR;
1205 einfo.ei_cb_bl = ll_extent_lock_callback;
1206 einfo.ei_cb_cp = ldlm_completion_ast;
1207 einfo.ei_cb_gl = ll_glimpse_callback;
1208 einfo.ei_cbdata = inode;
1210 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1211 oinfo.oi_lockh = &lockh;
1212 oinfo.oi_md = lli->lli_smd;
1213 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1215 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1219 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1220 RETURN(rc > 0 ? -EIO : rc);
1223 rc = ll_merge_lvb(inode);
1225 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1226 i_size_read(inode), inode->i_blocks);
/* Take a DLM extent lock of the given mode on [policy->l_extent.start,
 * policy->l_extent.end] for this inode, then refresh the inode's size
 * and timestamps from the merged LVB under ll_inode_size_lock().
 * On success *policy is updated to the extent actually granted and
 * *lockh holds the granted lock handle.
 * NOTE(review): some lines (ast_flags parameter, rc/lvb decls, early
 * RETURNs) are elided in this excerpt. */
1231 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1232 struct lov_stripe_md *lsm, int mode,
1233 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1236 struct ll_sb_info *sbi = ll_i2sbi(inode);
1238 struct ldlm_enqueue_info einfo = { 0 };
1239 struct obd_info oinfo = { { { 0 } } };
/* Caller must pass a fresh (unused) lock handle and valid striping. */
1243 LASSERT(!lustre_handle_is_used(lockh));
1244 LASSERT(lsm != NULL);
1246 /* don't drop the mmapped file to LRU */
1247 if (mapping_mapped(inode->i_mapping))
1248 ast_flags |= LDLM_FL_NO_LRU;
1250 /* XXX phil: can we do this? won't it screw the file size up? */
/* Locking is bypassed for fds opened with IGNORE_LOCK or nolck mounts. */
1251 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1252 (sbi->ll_flags & LL_SBI_NOLCK))
1255 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1256 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1258 einfo.ei_type = LDLM_EXTENT;
1259 einfo.ei_mode = mode;
1260 einfo.ei_cb_bl = ll_extent_lock_callback;
1261 einfo.ei_cb_cp = ldlm_completion_ast;
1262 einfo.ei_cb_gl = ll_glimpse_callback;
1263 einfo.ei_cbdata = inode;
1265 oinfo.oi_policy = *policy;
1266 oinfo.oi_lockh = lockh;
1268 oinfo.oi_flags = ast_flags;
1270 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
/* The server may grant a larger extent than requested; report it back. */
1271 *policy = oinfo.oi_policy;
1275 ll_inode_size_lock(inode, 1);
1276 inode_init_lvb(inode, &lvb);
1277 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
/* Only a whole-file [0, EOF] lock makes the merged size authoritative
 * enough to write it into i_size. */
1279 if (policy->l_extent.start == 0 &&
1280 policy->l_extent.end == OBD_OBJECT_EOF) {
1281 /* vmtruncate()->ll_truncate() first sets the i_size and then
1282 * the kms under both a DLM lock and the
1283 * ll_inode_size_lock(). If we don't get the
1284 * ll_inode_size_lock() here we can match the DLM lock and
1285 * reset i_size from the kms before the truncating path has
1286 * updated the kms. generic_file_write can then trust the
1287 * stale i_size when doing appending writes and effectively
1288 * cancel the result of the truncate. Getting the
1289 * ll_inode_size_lock() after the enqueue maintains the DLM
1290 * -> ll_inode_size_lock() acquiring order. */
1291 i_size_write(inode, lvb.lvb_size);
1292 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1293 inode->i_ino, i_size_read(inode));
/* Timestamps are refreshed from the LVB regardless of the extent. */
1297 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1298 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1299 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1301 ll_inode_size_unlock(inode, 1);
/* Release an extent lock previously taken by ll_extent_lock().
 * Mirrors the IGNORE_LOCK / LL_SBI_NOLCK bypass in the lock path so the
 * two stay symmetric; otherwise simply cancels the lock handle.
 * NOTE(review): rc declaration and RETURN are elided in this excerpt. */
1306 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1307 struct lov_stripe_md *lsm, int mode,
1308 struct lustre_handle *lockh)
1310 struct ll_sb_info *sbi = ll_i2sbi(inode);
1314 /* XXX phil: can we do this? won't it screw the file size up? */
1315 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1316 (sbi->ll_flags & LL_SBI_NOLCK))
1319 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/* read(2) entry point: take a PR extent lock covering (a chunk of) the
 * requested region, make i_size consistent with KMS so the generic read
 * path does the right thing, then hand off to generic_file_read().
 * Large reads are split into chunks of at most sbi->ll_max_rw_chunk,
 * looping until the request is satisfied or a short read occurs.
 * NOTE(review): many lines (loop labels, RETURNs, decls for rc/end/kms/
 * lvb) are elided in this excerpt. */
1324 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1327 struct inode *inode = file->f_dentry->d_inode;
1328 struct ll_inode_info *lli = ll_i2info(inode);
1329 struct lov_stripe_md *lsm = lli->lli_smd;
1330 struct ll_sb_info *sbi = ll_i2sbi(inode);
1331 struct ll_lock_tree tree;
1332 struct ll_lock_tree_node *node;
1334 struct ll_ra_read bead;
1337 ssize_t retval, chunk, sum = 0;
1341 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1342 inode->i_ino, inode->i_generation, inode, count, *ppos);
1343 /* "If nbyte is 0, read() will return 0 and have no other results."
1344 * -- Single Unix Spec */
1348 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1351 /* Read on file with no objects should return zero-filled
1352 * buffers up to file size (we can get non-zero sizes with
1353 * mknod + truncate, then opening file for read. This is a
1354 * common pattern in NFS case, it seems). Bug 6243 */
1356 /* Since there are no objects on OSTs, we have nothing to get
1357 * lock on and so we are forced to access inode->i_size
1360 /* Read beyond end of file */
1361 if (*ppos >= i_size_read(inode))
/* Clamp the request to the MDS-known file size. */
1364 if (count > i_size_read(inode) - *ppos)
1365 count = i_size_read(inode) - *ppos;
1366 /* Make sure to correctly adjust the file pos pointer for
/* Objectless file: hand back zeroes via clear_user(). */
1368 notzeroed = clear_user(buf, count);
1377 if (sbi->ll_max_rw_chunk != 0) {
1378 /* first, let's know the end of the current stripe */
1380 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1383 /* correct, the end is beyond the request */
1384 if (end > *ppos + count - 1)
1385 end = *ppos + count - 1;
1387 /* and chunk shouldn't be too large even if striping is wide */
1388 if (end - *ppos > sbi->ll_max_rw_chunk)
1389 end = *ppos + sbi->ll_max_rw_chunk - 1;
/* Chunking disabled: lock the entire requested region at once. */
1391 end = *ppos + count - 1;
1394 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1396 GOTO(out, retval = PTR_ERR(node));
1399 tree.lt_fd = LUSTRE_FPRIVATE(file);
1400 rc = ll_tree_lock(&tree, node, buf, count,
1401 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1403 GOTO(out, retval = rc);
1405 ll_inode_size_lock(inode, 1);
1407 * Consistency guarantees: following possibilities exist for the
1408 * relation between region being read and real file size at this
1411 * (A): the region is completely inside of the file;
1413 * (B-x): x bytes of region are inside of the file, the rest is
1416 * (C): the region is completely outside of the file.
1418 * This classification is stable under DLM lock acquired by
1419 * ll_tree_lock() above, because to change class, other client has to
1420 * take DLM lock conflicting with our lock. Also, any updates to
1421 * ->i_size by other threads on this client are serialized by
1422 * ll_inode_size_lock(). This guarantees that short reads are handled
1423 * correctly in the face of concurrent writes and truncates.
1425 inode_init_lvb(inode, &lvb);
1426 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1428 if (*ppos + count - 1 > kms) {
1429 /* A glimpse is necessary to determine whether we return a
1430 * short read (B) or some zeroes at the end of the buffer (C) */
1431 ll_inode_size_unlock(inode, 1);
1432 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1434 ll_tree_unlock(&tree);
1438 /* region is within kms and, hence, within real file size (A).
1439 * We need to increase i_size to cover the read region so that
1440 * generic_file_read() will do its job, but that doesn't mean
1441 * the kms size is _correct_, it is only the _minimum_ size.
1442 * If someone does a stat they will get the correct size which
1443 * will always be >= the kms value here. b=11081 */
1444 if (i_size_read(inode) < kms)
1445 i_size_write(inode, kms);
1446 ll_inode_size_unlock(inode, 1);
1449 chunk = end - *ppos + 1;
1450 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1451 inode->i_ino, chunk, *ppos, i_size_read(inode));
1453 /* turn off the kernel's read-ahead */
1454 file->f_ra.ra_pages = 0;
1456 /* initialize read-ahead window once per syscall */
1459 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1460 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1461 ll_ra_read_in(file, &bead);
1465 file_accessed(file);
/* The actual data transfer happens under the extent lock. */
1466 retval = generic_file_read(file, buf, chunk, ppos);
1467 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1469 ll_tree_unlock(&tree);
/* A full chunk with bytes remaining means we loop for the next chunk. */
1475 if (retval == chunk && count > 0)
1481 ll_ra_read_ex(file, &bead);
/* Report total bytes read across chunks, or the error/short result. */
1482 retval = (sum > 0) ? sum : retval;
1487 * Write to a file (through the page cache).
/* write(2) entry point: serialize against other writers on this client
 * with lli_write_sem, take a PW extent lock on (a chunk of) the target
 * region — [0, EOF] for O_APPEND — enforce maxbytes, and hand off to
 * generic_file_write(). Chunked like ll_file_read() when
 * sbi->ll_max_rw_chunk is set.
 * NOTE(review): loop labels, RETURNs and some decls (rc, lock_start
 * assignments) are elided in this excerpt. */
1489 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1492 struct inode *inode = file->f_dentry->d_inode;
1493 struct ll_sb_info *sbi = ll_i2sbi(inode);
1494 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1495 struct ll_lock_tree tree;
1496 struct ll_lock_tree_node *node;
1497 loff_t maxbytes = ll_file_maxbytes(inode);
1498 loff_t lock_start, lock_end, end;
1499 ssize_t retval, chunk, sum = 0;
1503 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1504 inode->i_ino, inode->i_generation, inode, count, *ppos);
1506 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1508 /* POSIX, but surprised the VFS doesn't check this already */
1512 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1513 * called on the file, don't fail the below assertion (bug 2388). */
1514 if (file->f_flags & O_LOV_DELAY_CREATE &&
1515 ll_i2info(inode)->lli_smd == NULL)
1518 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per client; released at "out" below. */
1520 down(&ll_i2info(inode)->lli_write_sem);
1523 chunk = 0; /* just to fix gcc's warning */
1524 end = *ppos + count - 1;
1526 if (file->f_flags & O_APPEND) {
/* Append must lock to EOF since the final offset isn't known yet. */
1528 lock_end = OBD_OBJECT_EOF;
1529 } else if (sbi->ll_max_rw_chunk != 0) {
1530 /* first, let's know the end of the current stripe */
1532 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1535 /* correct, the end is beyond the request */
1536 if (end > *ppos + count - 1)
1537 end = *ppos + count - 1;
1539 /* and chunk shouldn't be too large even if striping is wide */
1540 if (end - *ppos > sbi->ll_max_rw_chunk)
1541 end = *ppos + sbi->ll_max_rw_chunk - 1;
1546 lock_end = *ppos + count - 1;
1548 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1551 GOTO(out, retval = PTR_ERR(node));
1553 tree.lt_fd = LUSTRE_FPRIVATE(file);
1554 rc = ll_tree_lock(&tree, node, buf, count,
1555 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1557 GOTO(out, retval = rc);
1559 /* This is ok, g_f_w will overwrite this under i_sem if it races
1560 * with a local truncate, it just makes our maxbyte checking easier.
1561 * The i_size value gets updated in ll_extent_lock() as a consequence
1562 * of the [0,EOF] extent lock we requested above. */
1563 if (file->f_flags & O_APPEND) {
1564 *ppos = i_size_read(inode);
1565 end = *ppos + count - 1;
/* Enforce the filesystem's max file size, per POSIX (SIGXFSZ). */
1568 if (*ppos >= maxbytes) {
1569 send_sig(SIGXFSZ, current, 0);
1570 GOTO(out_unlock, retval = -EFBIG);
1572 if (*ppos + count > maxbytes)
1573 count = maxbytes - *ppos;
1575 /* generic_file_write handles O_APPEND after getting i_mutex */
1576 chunk = end - *ppos + 1;
1577 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1578 inode->i_ino, chunk, *ppos);
1579 retval = generic_file_write(file, buf, chunk, ppos);
1580 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1583 ll_tree_unlock(&tree);
/* Full chunk written with bytes remaining: loop for the next chunk. */
1590 if (retval == chunk && count > 0)
1594 up(&ll_i2info(inode)->lli_write_sem);
1596 retval = (sum > 0) ? sum : retval;
1597 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1598 retval > 0 ? retval : 0);
1603 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) entry point: same KMS/i_size consistency dance as
 * ll_file_read() — PR extent lock over the region, glimpse if the
 * request extends past KMS — then delegate to generic_file_sendfile().
 * Unlike ll_file_read() this path is not chunked.
 * NOTE(review): early RETURNs and decls (retval, rc, kms, lvb) are
 * elided in this excerpt. */
1605 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1606 read_actor_t actor, void *target)
1608 struct inode *inode = in_file->f_dentry->d_inode;
1609 struct ll_inode_info *lli = ll_i2info(inode);
1610 struct lov_stripe_md *lsm = lli->lli_smd;
1611 struct ll_lock_tree tree;
1612 struct ll_lock_tree_node *node;
1614 struct ll_ra_read bead;
1619 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1620 inode->i_ino, inode->i_generation, inode, count, *ppos);
1622 /* "If nbyte is 0, read() will return 0 and have no other results."
1623 * -- Single Unix Spec */
1627 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1628 /* turn off the kernel's read-ahead */
1629 in_file->f_ra.ra_pages = 0;
1631 /* File with no objects, nothing to lock */
1633 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1635 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1637 RETURN(PTR_ERR(node));
1639 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1640 rc = ll_tree_lock(&tree, node, NULL, count,
1641 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1645 ll_inode_size_lock(inode, 1);
1647 * Consistency guarantees: following possibilities exist for the
1648 * relation between region being read and real file size at this
1651 * (A): the region is completely inside of the file;
1653 * (B-x): x bytes of region are inside of the file, the rest is
1656 * (C): the region is completely outside of the file.
1658 * This classification is stable under DLM lock acquired by
1659 * ll_tree_lock() above, because to change class, other client has to
1660 * take DLM lock conflicting with our lock. Also, any updates to
1661 * ->i_size by other threads on this client are serialized by
1662 * ll_inode_size_lock(). This guarantees that short reads are handled
1663 * correctly in the face of concurrent writes and truncates.
1665 inode_init_lvb(inode, &lvb);
1666 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1668 if (*ppos + count - 1 > kms) {
1669 /* A glimpse is necessary to determine whether we return a
1670 * short read (B) or some zeroes at the end of the buffer (C) */
1671 ll_inode_size_unlock(inode, 1);
1672 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1676 /* region is within kms and, hence, within real file size (A) */
1677 i_size_write(inode, kms);
1678 ll_inode_size_unlock(inode, 1);
1681 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1682 inode->i_ino, count, *ppos, i_size_read(inode));
/* Lustre-managed read-ahead window for this request. */
1684 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1685 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1686 ll_ra_read_in(in_file, &bead);
1688 file_accessed(in_file);
1689 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1690 ll_ra_read_ex(in_file, &bead);
1693 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ handler: recreate a lost OST object for this
 * file at the id/group/index given by the user (root only). Copies the
 * file's stripe md, marks the obdo with OBD_FL_RECREATE_OBJS and calls
 * obd_create() against the data export.
 * NOTE(review): early RETURNs, oa allocation and some error paths are
 * elided in this excerpt. */
1697 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1700 struct ll_inode_info *lli = ll_i2info(inode);
1701 struct obd_export *exp = ll_i2dtexp(inode);
1702 struct ll_recreate_obj ucreatp;
1703 struct obd_trans_info oti = { 0 };
1704 struct obdo *oa = NULL;
1707 struct lov_stripe_md *lsm, *lsm2;
/* Object recreation is an administrative operation. */
1710 if (!capable (CAP_SYS_ADMIN))
1713 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1714 sizeof(struct ll_recreate_obj));
/* lli_size_sem guards lli_smd while we copy it; dropped at "out". */
1722 down(&lli->lli_size_sem);
1725 GOTO(out, rc = -ENOENT);
1726 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1727 (lsm->lsm_stripe_count));
1729 OBD_ALLOC(lsm2, lsm_size);
1731 GOTO(out, rc = -ENOMEM);
1733 oa->o_id = ucreatp.lrc_id;
1734 oa->o_gr = ucreatp.lrc_group;
/* o_nlink carries the target OST index for recreation requests. */
1735 oa->o_nlink = ucreatp.lrc_ost_idx;
1736 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1737 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1738 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1739 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1741 memcpy(lsm2, lsm, lsm_size);
1742 rc = obd_create(exp, oa, &lsm2, &oti);
1744 OBD_FREE(lsm2, lsm_size);
1747 up(&lli->lli_size_sem);
/* Apply user-supplied striping (lov_user_md) to a file by re-opening it
 * with an IT_OPEN intent carrying the EA, under lli_size_sem. Fails if
 * the file already has a stripe md (striping is set-once).
 * NOTE(review): early RETURNs, the lsm check and labels are elided in
 * this excerpt. */
1752 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1753 int flags, struct lov_user_md *lum, int lum_size)
1755 struct ll_inode_info *lli = ll_i2info(inode);
1756 struct lov_stripe_md *lsm;
1757 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1761 down(&lli->lli_size_sem);
/* Striping already set: nothing to do, report and bail out. */
1764 up(&lli->lli_size_sem);
1765 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
/* The intent open carries the EA to the MDS, which sets the striping. */
1770 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1773 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1774 GOTO(out_req_free, rc = -ENOENT);
1775 rc = oit.d.lustre.it_status;
1777 GOTO(out_req_free, rc);
/* Close the open handle created purely for the setstripe intent. */
1779 ll_release_openhandle(file->f_dentry, &oit);
1782 up(&lli->lli_size_sem);
1783 ll_intent_release(&oit);
1786 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA (striping) of `filename` from the MDS via
 * md_getattr_name(), byte-swap it to host endianness if needed, and —
 * for LOV_MAGIC_JOIN files — expand it into a lov_user_md_join with
 * per-stripe extents. Returns the lmm buffer, its size and the request
 * (so the caller can release the reply buffer) via out-parameters.
 * NOTE(review): early RETURNs, some decls (rc, lmmsize) and labels are
 * elided in this excerpt. */
1790 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1791 struct lov_mds_md **lmmp, int *lmm_size,
1792 struct ptlrpc_request **request)
1794 struct ll_sb_info *sbi = ll_i2sbi(inode);
1795 struct mdt_body *body;
1796 struct lov_mds_md *lmm = NULL;
1797 struct ptlrpc_request *req = NULL;
1798 struct obd_capa *oc;
1801 rc = ll_get_max_mdsize(sbi, &lmmsize);
1805 oc = ll_mdscapa_get(inode);
1806 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1807 oc, filename, strlen(filename) + 1,
1808 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1809 ll_i2suppgid(inode), &req);
1812 CDEBUG(D_INFO, "md_getattr_name failed "
1813 "on %s: rc %d\n", filename, rc);
1817 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1818 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1819 /* swabbed by mdc_getattr_name */
1820 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1822 lmmsize = body->eadatasize;
/* No striping EA present: report -ENODATA to the caller. */
1824 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1826 GOTO(out, rc = -ENODATA);
1829 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1830 LASSERT(lmm != NULL);
1831 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1834 * This is coming from the MDS, so is probably in
1835 * little endian. We convert it to host endian before
1836 * passing it to userspace.
/* Magic appears byte-swapped relative to this host: swab in place. */
1838 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1839 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1840 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1841 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1842 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1845 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1846 struct lov_stripe_md *lsm;
1847 struct lov_user_md_join *lmj;
1848 int lmj_size, i, aindex = 0;
/* Unpack the join md so we can walk its extent array per stripe. */
1850 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1852 GOTO(out, rc = -ENOMEM);
1853 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1855 GOTO(out_free_memmd, rc);
1857 lmj_size = sizeof(struct lov_user_md_join) +
1858 lsm->lsm_stripe_count *
1859 sizeof(struct lov_user_ost_data_join);
1860 OBD_ALLOC(lmj, lmj_size);
1862 GOTO(out_free_memmd, rc = -ENOMEM);
1864 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1865 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1866 struct lov_extent *lex =
1867 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent that actually covers stripe i. */
1869 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1871 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1872 LPU64" len %d\n", aindex, i,
1873 lex->le_start, (int)lex->le_len);
1874 lmj->lmm_objects[i].l_extent_start =
/* le_len of -1 denotes an open-ended (to-EOF) extent. */
1877 if ((int)lex->le_len == -1)
1878 lmj->lmm_objects[i].l_extent_end = -1;
1880 lmj->lmm_objects[i].l_extent_end =
1881 lex->le_start + lex->le_len;
1882 lmj->lmm_objects[i].l_object_id =
1883 lsm->lsm_oinfo[i]->loi_id;
1884 lmj->lmm_objects[i].l_object_gr =
1885 lsm->lsm_oinfo[i]->loi_gr;
1886 lmj->lmm_objects[i].l_ost_gen =
1887 lsm->lsm_oinfo[i]->loi_ost_gen;
1888 lmj->lmm_objects[i].l_ost_idx =
1889 lsm->lsm_oinfo[i]->loi_ost_idx;
/* Hand back the expanded join md instead of the raw one. */
1891 lmm = (struct lov_mds_md *)lmj;
1894 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1898 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler (root only): copy a lov_user_md plus one
 * lov_user_ost_data from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS, i.e. the objects
 * already exist and only the EA is being set.
 * NOTE(review): early RETURNs and copy_from_user error handling are
 * elided in this excerpt. */
1903 static int ll_lov_setea(struct inode *inode, struct file *file,
1906 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1907 struct lov_user_md *lump;
1908 int lum_size = sizeof(struct lov_user_md) +
1909 sizeof(struct lov_user_ost_data);
1913 if (!capable (CAP_SYS_ADMIN))
1916 OBD_ALLOC(lump, lum_size);
1920 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1922 OBD_FREE(lump, lum_size);
1926 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1928 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy the lov_user_md from userspace,
 * set the striping, then echo the resulting striping back to the
 * caller's buffer via the GETSTRIPE iocontrol.
 * NOTE(review): early RETURNs are elided in this excerpt. */
1932 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1935 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1937 int flags = FMODE_WRITE;
1940 /* Bug 1152: copy properly when this is no longer true */
1941 LASSERT(sizeof(lum) == sizeof(*lump));
1942 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1943 rc = copy_from_user(&lum, lump, sizeof(lum));
1947 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
/* Zero the user's stripe_count first so GETSTRIPE fills it fresh. */
1949 put_user(0, &lump->lmm_stripe_count);
1950 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1951 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE handler: pass the inode's stripe md to the data
 * export's iocontrol, which copies the striping out to userspace.
 * NOTE(review): the no-lsm early return is elided in this excerpt. */
1956 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1958 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1963 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK handler: take an LCK_GROUP extent lock over the
 * whole file [0, EOF] with the group id from `arg`, remember the lock
 * handle in the fd, and mark the fd GROUP_LOCKED | IGNORE_LOCK so the
 * normal per-IO extent locking is bypassed while held.
 * NOTE(review): early RETURNs and flags/rc decls are elided in this
 * excerpt. */
1967 static int ll_get_grouplock(struct inode *inode, struct file *file,
1970 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1971 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1972 .end = OBD_OBJECT_EOF}};
1973 struct lustre_handle lockh = { 0 };
1974 struct ll_inode_info *lli = ll_i2info(inode);
1975 struct lov_stripe_md *lsm = lli->lli_smd;
/* Only one group lock per fd. */
1979 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1983 policy.l_extent.gid = arg;
1984 if (file->f_flags & O_NONBLOCK)
1985 flags = LDLM_FL_BLOCK_NOWAIT;
1987 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1991 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
/* Stash the handle so ll_put_grouplock() can cancel it later. */
1993 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK handler: validate that the fd holds a group lock
 * with matching gid, clear the GROUP_LOCKED/IGNORE_LOCK flags, cancel
 * the saved lock handle and wipe it from the fd.
 * NOTE(review): early RETURNs and rc decl are elided in this excerpt. */
1998 static int ll_put_grouplock(struct inode *inode, struct file *file,
2001 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2002 struct ll_inode_info *lli = ll_i2info(inode);
2003 struct lov_stripe_md *lsm = lli->lli_smd;
2007 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2008 /* Ugh, it's already unlocked. */
2012 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2015 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2017 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
/* Handle consumed; clear it to avoid stale reuse. */
2022 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request: server must advertise LL_SBI_JOIN, both
 * inodes must be regular files, head != tail, and the head's size must
 * be a multiple of JOIN_FILE_ALIGN (64K).
 * NOTE(review): the error RETURNs after each CERROR are elided in this
 * excerpt. */
2027 static int join_sanity_check(struct inode *head, struct inode *tail)
2030 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2031 CERROR("server do not support join \n");
2034 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2035 CERROR("tail ino %lu and ino head %lu must be regular\n",
2036 head->i_ino, tail->i_ino);
2039 if (head->i_ino == tail->i_ino) {
2040 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2043 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2044 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/* Issue the actual join to the MDS: open the head with O_JOIN_FILE
 * intent, passing the head's size as op-data and the tail's name/parent
 * so the MDS can splice the tail onto the head. Any lock granted as a
 * side effect of the intent is dropped immediately, and the open handle
 * created by the intent is closed via ll_release_openhandle().
 * NOTE(review): rc/data decls and some error RETURNs are elided in this
 * excerpt. */
2050 static int join_file(struct inode *head_inode, struct file *head_filp,
2051 struct file *tail_filp)
2053 struct dentry *tail_dentry = tail_filp->f_dentry;
2054 struct lookup_intent oit = {.it_op = IT_OPEN,
2055 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2056 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2057 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2059 struct lustre_handle lockh;
2060 struct md_op_data *op_data;
2065 tail_dentry = tail_filp->f_dentry;
/* The head's current size tells the MDS where the tail is appended. */
2067 data = i_size_read(head_inode);
2068 op_data = ll_prep_md_op_data(NULL, head_inode,
2069 tail_dentry->d_parent->d_inode,
2070 tail_dentry->d_name.name,
2071 tail_dentry->d_name.len, 0,
2072 LUSTRE_OPC_ANY, &data);
2073 if (IS_ERR(op_data))
2074 RETURN(PTR_ERR(op_data));
2076 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2077 op_data, &lockh, NULL, 0, 0);
2079 ll_finish_md_op_data(op_data);
2083 rc = oit.d.lustre.it_status;
2085 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2086 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2087 ptlrpc_req_finished((struct ptlrpc_request *)
2088 oit.d.lustre.it_data);
2092 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2094 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2095 oit.d.lustre.it_lock_mode = 0;
2097 ll_release_openhandle(head_filp->f_dentry, &oit);
2099 ll_intent_release(&oit);
/* LL_IOC_JOIN top level: open the tail file by name, take whole-file
 * LCK_EX tree locks on both inodes in ino order (to avoid ABBA
 * deadlock between two concurrent joins), sanity-check, perform the
 * join, then tear everything down via a phase-numbered cleanup switch
 * (fall-through: each phase undoes its own step then the earlier ones).
 * NOTE(review): case labels, RETURNs and some decls are elided in this
 * excerpt. */
2103 static int ll_file_join(struct inode *head, struct file *filp,
2104 char *filename_tail)
2106 struct inode *tail = NULL, *first = NULL, *second = NULL;
2107 struct dentry *tail_dentry;
2108 struct file *tail_filp, *first_filp, *second_filp;
2109 struct ll_lock_tree first_tree, second_tree;
2110 struct ll_lock_tree_node *first_node, *second_node;
2111 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2112 int rc = 0, cleanup_phase = 0;
2115 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2116 head->i_ino, head->i_generation, head, filename_tail);
2118 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2119 if (IS_ERR(tail_filp)) {
2120 CERROR("Can not open tail file %s", filename_tail);
2121 rc = PTR_ERR(tail_filp);
/* Hold our own reference on the tail inode for the duration. */
2124 tail = igrab(tail_filp->f_dentry->d_inode);
2126 tlli = ll_i2info(tail);
2127 tail_dentry = tail_filp->f_dentry;
2128 LASSERT(tail_dentry);
2131 /*reorder the inode for lock sequence*/
/* Always lock the higher-ino inode first so two racing joins on the
 * same pair acquire locks in the same order. */
2132 first = head->i_ino > tail->i_ino ? head : tail;
2133 second = head->i_ino > tail->i_ino ? tail : head;
2134 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2135 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2137 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2138 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2139 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2140 if (IS_ERR(first_node)){
2141 rc = PTR_ERR(first_node);
2144 first_tree.lt_fd = first_filp->private_data;
2145 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2150 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2151 if (IS_ERR(second_node)){
2152 rc = PTR_ERR(second_node);
2155 second_tree.lt_fd = second_filp->private_data;
2156 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2161 rc = join_sanity_check(head, tail);
2165 rc = join_file(head, filp, tail_filp);
/* Phased teardown: higher phases fall through to undo earlier steps. */
2169 switch (cleanup_phase) {
2171 ll_tree_unlock(&second_tree);
2172 obd_cancel_unused(ll_i2dtexp(second),
2173 ll_i2info(second)->lli_smd, 0, NULL);
2175 ll_tree_unlock(&first_tree);
2176 obd_cancel_unused(ll_i2dtexp(first),
2177 ll_i2info(first)->lli_smd, 0, NULL);
2179 filp_close(tail_filp, 0);
/* On success the head's stripe md is stale (the file layout changed on
 * the MDS), so drop the cached copy. */
2182 if (head && rc == 0) {
2183 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2185 hlli->lli_smd = NULL;
2190 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Close an MDS open handle that was created as a side effect of an
 * intent (e.g. setstripe or join opens) and is not backed by a real
 * file descriptor. No-op for the root dentry or when the intent did
 * not actually perform an open.
 * NOTE(review): early RETURNs, och free and labels are elided in this
 * excerpt. */
2196 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2198 struct inode *inode = dentry->d_inode;
2199 struct obd_client_handle *och;
2205 /* Root ? Do nothing. */
2206 if (dentry->d_inode->i_sb->s_root == dentry)
2209 /* No open handle to close? Move away */
2210 if (!it_disposition(it, DISP_OPEN_OPEN))
2213 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2215 OBD_ALLOC(och, sizeof(*och));
2217 GOTO(out, rc = -ENOMEM);
/* Populate the client handle from the intent's open reply ... */
2219 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2220 ll_i2info(inode), it, och);
/* ... and send the matching close to the MDS. */
2222 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2225 /* this one is in place of ll_file_open */
2226 ptlrpc_req_finished(it->d.lustre.it_data);
2227 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* ioctl(2) dispatcher for regular Lustre files: handles the LL_IOC_*
 * family (flags, striping, object recreate, group locks, join,
 * statfs), forwards EXT3 flag/version ioctls, and falls through to the
 * dynamic ll_iocontrol_call() registry and finally obd_iocontrol() on
 * the data export for anything unrecognized.
 * NOTE(review): switch/case scaffolding, RETURNs and some decls (flags,
 * rc, ftail, err) are elided in this excerpt. */
2231 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2234 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2238 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2239 inode->i_generation, inode, cmd);
2240 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2242 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2243 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2247 case LL_IOC_GETFLAGS:
2248 /* Get the current value of the file flags */
2249 return put_user(fd->fd_flags, (int *)arg);
2250 case LL_IOC_SETFLAGS:
2251 case LL_IOC_CLRFLAGS:
2252 /* Set or clear specific file flags */
2253 /* XXX This probably needs checks to ensure the flags are
2254 * not abused, and to handle any flag side effects.
2256 if (get_user(flags, (int *) arg))
/* IGNORE_LOCK is only safe for O_DIRECT IO, which bypasses the
 * page cache and so cannot leave stale cached data unlocked. */
2259 if (cmd == LL_IOC_SETFLAGS) {
2260 if ((flags & LL_FILE_IGNORE_LOCK) &&
2261 !(file->f_flags & O_DIRECT)) {
2262 CERROR("%s: unable to disable locking on "
2263 "non-O_DIRECT file\n", current->comm);
2267 fd->fd_flags |= flags;
2269 fd->fd_flags &= ~flags;
2272 case LL_IOC_LOV_SETSTRIPE:
2273 RETURN(ll_lov_setstripe(inode, file, arg));
2274 case LL_IOC_LOV_SETEA:
2275 RETURN(ll_lov_setea(inode, file, arg));
2276 case LL_IOC_LOV_GETSTRIPE:
2277 RETURN(ll_lov_getstripe(inode, arg));
2278 case LL_IOC_RECREATE_OBJ:
2279 RETURN(ll_lov_recreate_obj(inode, file, arg));
2280 case EXT3_IOC_GETFLAGS:
2281 case EXT3_IOC_SETFLAGS:
2282 RETURN(ll_iocontrol(inode, file, cmd, arg));
2283 case EXT3_IOC_GETVERSION_OLD:
2284 case EXT3_IOC_GETVERSION:
2285 RETURN(put_user(inode->i_generation, (int *)arg));
/* LL_IOC_JOIN: tail filename comes from userspace via getname(). */
2290 ftail = getname((const char *)arg);
2292 RETURN(PTR_ERR(ftail));
2293 rc = ll_file_join(inode, file, ftail);
2297 case LL_IOC_GROUP_LOCK:
2298 RETURN(ll_get_grouplock(inode, file, arg));
2299 case LL_IOC_GROUP_UNLOCK:
2300 RETURN(ll_put_grouplock(inode, file, arg));
2301 case IOC_OBD_STATFS:
2302 RETURN(ll_obd_statfs(inode, (void *)arg));
2304 /* We need to special case any other ioctls we want to handle,
2305 * to send them to the MDS/OST as appropriate and to properly
2306 * network encode the arg field.
2307 case EXT3_IOC_SETVERSION_OLD:
2308 case EXT3_IOC_SETVERSION:
2310 case LL_IOC_FLUSHCTX:
2311 RETURN(ll_flush_ctx(inode));
/* Last resorts: dynamically registered handlers, then the data export. */
2316 ll_iocontrol_call(inode, file, cmd, arg, &err))
2319 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek entry point. SEEK_END must glimpse the OSTs first so i_size is
 * current before the offset is computed; the size read is protected by
 * ll_inode_size_lock(). The final offset is validated against
 * ll_file_maxbytes() before f_pos is updated.
 * NOTE(review): retval decl, RETURNs and closing braces are elided in
 * this excerpt. */
2325 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2327 struct inode *inode = file->f_dentry->d_inode;
2328 struct ll_inode_info *lli = ll_i2info(inode);
2329 struct lov_stripe_md *lsm = lli->lli_smd;
/* Tentative result used only for the trace message below. */
2332 retval = offset + ((origin == 2) ? i_size_read(inode) :
2333 (origin == 1) ? file->f_pos : 0);
2334 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2335 inode->i_ino, inode->i_generation, inode, retval, retval,
2336 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2337 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2339 if (origin == 2) { /* SEEK_END */
2340 int nonblock = 0, rc;
2342 if (file->f_flags & O_NONBLOCK)
2343 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before computing end-relative offset. */
2346 rc = ll_glimpse_size(inode, nonblock);
2351 ll_inode_size_lock(inode, 0);
2352 offset += i_size_read(inode);
2353 ll_inode_size_unlock(inode, 0);
2354 } else if (origin == 1) { /* SEEK_CUR */
2355 offset += file->f_pos;
2359 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2360 if (offset != file->f_pos) {
2361 file->f_pos = offset;
2362 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2364 file->f_version = ++event;
/* fsync(2) entry point: wait for in-flight writeback, surface any
 * recorded async write errors (inode-level and per-stripe), sync the
 * metadata via md_sync() to the MDS, and — when the file has objects —
 * sync the data objects [0, EOF] via obd_sync() to the OSTs.
 * NOTE(review): rc/err decls, some conditionals, RETURNs and obdo
 * alloc/free are elided in this excerpt. */
2373 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2375 struct inode *inode = dentry->d_inode;
2376 struct ll_inode_info *lli = ll_i2info(inode);
2377 struct lov_stripe_md *lsm = lli->lli_smd;
2378 struct ptlrpc_request *req;
2379 struct obd_capa *oc;
2382 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2383 inode->i_generation, inode);
2384 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2386 /* fsync's caller has already called _fdata{sync,write}, we want
2387 * that IO to finish before calling the osc and mdc sync methods */
2388 rc = filemap_fdatawait(inode->i_mapping);
2390 /* catch async errors that were recorded back when async writeback
2391 * failed for pages in this mapping. */
2392 err = lli->lli_async_rc;
2393 lli->lli_async_rc = 0;
/* Per-stripe async errors are collected (and cleared) from the lsm. */
2397 err = lov_test_and_clear_async_rc(lsm);
2402 oc = ll_mdscapa_get(inode);
2403 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2409 ptlrpc_req_finished(req);
/* obdo allocation failed: report earlier rc, else -ENOMEM. */
2416 RETURN(rc ? rc : -ENOMEM);
2418 oa->o_id = lsm->lsm_object_id;
2419 oa->o_gr = lsm->lsm_object_gr;
2420 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2421 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2422 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
/* Write capability is needed to flush data to the OSTs. */
2425 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2426 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2427 0, OBD_OBJECT_EOF, oc);
/* ll_file_flock(): VFS lock/flock method — translate a kernel
 * file_lock request (POSIX fcntl or BSD flock) into an LDLM_FLOCK
 * enqueue on the MDS, so locks are coherent across clients.
 *
 * Visible mapping: fl_type F_RDLCK -> LCK_PR, F_WRLCK -> LCK_PW, and
 * unlock -> LCK_NL (see the long comment below); cmd selects flags
 * such as LDLM_FL_BLOCK_NOWAIT (non-blocking) or LDLM_FL_TEST_LOCK
 * (F_GETLK probe).  On success the lock is mirrored into the local
 * kernel lock tables (ll_flock_lock_file_wait / posix_lock_file_wait)
 * so the VFS bookkeeping matches the cluster state.
 *
 * NOTE(review): sparse excerpt — the switch labels, some flag
 * assignments and the final return are elided; comments describe only
 * the visible statements. */
2437 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2439         struct inode *inode = file->f_dentry->d_inode;
2440         struct ll_sb_info *sbi = ll_i2sbi(inode);
/* resource id for the DLM lock is built from the inode's FID triple */
2441         struct ldlm_res_id res_id =
2442                 { .name = { fid_seq(ll_inode2fid(inode)),
2443                             fid_oid(ll_inode2fid(inode)),
2444                             fid_ver(ll_inode2fid(inode)),
2446         struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2447                 ldlm_flock_completion_ast, NULL, file_lock };
2448         struct lustre_handle lockh = {0};
2449         ldlm_policy_data_t flock;
2454         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2455                inode->i_ino, file_lock);
2457         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* BSD flock() arrives with fewer fields filled in than fcntl locks */
2459         if (file_lock->fl_flags & FL_FLOCK) {
2460                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2461                 /* set missing params for flock() calls */
2462                 file_lock->fl_end = OFFSET_MAX;
2463                 file_lock->fl_pid = current->tgid;
/* copy the byte-range description into the LDLM policy data */
2465         flock.l_flock.pid = file_lock->fl_pid;
2466         flock.l_flock.start = file_lock->fl_start;
2467         flock.l_flock.end = file_lock->fl_end;
2469         switch (file_lock->fl_type) {
/* read lock -> protected-read DLM mode */
2471                 einfo.ei_mode = LCK_PR;
2474         /* An unlock request may or may not have any relation to
2475          * existing locks so we may not be able to pass a lock handle
2476          * via a normal ldlm_lock_cancel() request. The request may even
2477          * unlock a byte range in the middle of an existing lock. In
2478          * order to process an unlock request we need all of the same
2479          * information that is given with a normal read or write record
2480          * lock request. To avoid creating another ldlm unlock (cancel)
2481          * message we'll treat a LCK_NL flock request as an unlock. */
2482                 einfo.ei_mode = LCK_NL;
/* write lock -> protected-write DLM mode */
2485                 einfo.ei_mode = LCK_PW;
2488                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* non-blocking variants: fail rather than wait for a conflicting lock */
2503                 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style probe: test for conflicts without taking the lock */
2509                 flags = LDLM_FL_TEST_LOCK;
2510                 /* Save the old mode so that if the mode in the lock changes we
2511                  * can decrement the appropriate reader or writer refcount. */
2512                 file_lock->fl_type = einfo.ei_mode;
2515                 CERROR("unknown fcntl lock command: %d\n", cmd);
2519         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2520                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2521                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock on the MDS; completion AST handles blocking waits */
2523         rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2524                               &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* on success, mirror the lock into the local kernel lock lists */
2525         if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2526                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2527 #ifdef HAVE_F_OP_FLOCK
2528         if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2529             !(flags & LDLM_FL_TEST_LOCK))
2530                 posix_lock_file_wait(file, file_lock);
/* ll_file_noflock(): lock/flock method installed for "-o noflock"
 * mounts; body is elided in this excerpt but, per the comment at the
 * ll_file_operations_noflock table below, it returns ENOSYS for all
 * lock requests. */
2536 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* ll_have_md_lock(): test (without taking a reference) whether this
 * client already holds a granted MDS inodebits lock covering @bits on
 * @inode, in any of the CR/CW/PR/PW modes.  LDLM_FL_TEST_LOCK makes
 * md_lock_match a pure probe.
 * NOTE(review): sparse excerpt — the return statements are elided. */
2543 int ll_have_md_lock(struct inode *inode, __u64 bits)
2545         struct lustre_handle lockh;
2546         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2554         fid = &ll_i2info(inode)->lli_fid;
2555         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2557         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2558         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2559                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/* ll_take_md_lock(): like ll_have_md_lock() but without
 * LDLM_FL_TEST_LOCK — on a match a reference is taken and the handle
 * is returned to the caller via @lockh, who becomes responsible for
 * releasing it.  Returns the matched ldlm_mode_t (0 on no match,
 * presumably — final return is elided in this excerpt). */
2565 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2566                             struct lustre_handle *lockh)
2568         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2574         fid = &ll_i2info(inode)->lli_fid;
2575         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2577         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2578         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2579                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* ll_inode_revalidate_fini(): common tail for the revalidate paths —
 * translate the RPC result.  -ENOENT means the object was already
 * unlinked on the MDS; that is treated as success (nlink updated,
 * per the comment) rather than an error.  Other non-zero rc values
 * are logged.  NOTE(review): sparse excerpt — the nlink update and
 * return statements are elided. */
2583 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2584         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2585                               * and return success */
2587                 /* This path cannot be hit for regular files unless in
2588                  * case of obscure races, so no need to to validate
2590                 if (!S_ISREG(inode->i_mode) &&
2591                     !S_ISDIR(inode->i_mode))
2596                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* ll_inode_revalidate_it(): refresh @dentry's inode attributes from
 * the MDS before a getattr/stat.
 *
 * Two visible strategies:
 *  - server supports OBD_CONNECT_ATTRFID: issue an IT_GETATTR intent
 *    lock by FID (no name), with O_CHECK_STALE to detect stale
 *    dentries, then finish the revalidation and drop unlinked
 *    dentries from the dcache;
 *  - otherwise, if no MDS_INODELOCK_UPDATE lock is cached locally,
 *    fall back to a plain md_getattr RPC (requesting EA size too for
 *    regular files) and rebuild the inode from the reply.
 * Finally, glimpse the file size from the OSTs unless the objects
 * have not been allocated yet.
 *
 * NOTE(review): sparse excerpt — error branches, GOTO/RETURN
 * statements and several closing braces are elided; comments describe
 * only the visible statements. */
2604 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2606         struct inode *inode = dentry->d_inode;
2607         struct ptlrpc_request *req = NULL;
2608         struct ll_sb_info *sbi;
2609         struct obd_export *exp;
/* presumably a "should never happen" diagnostic on a NULL inode —
 * TODO confirm against the full source */
2614                 CERROR("REPORT THIS LINE TO PETER\n");
2617         sbi = ll_i2sbi(inode);
2619         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2620                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2622         exp = ll_i2mdexp(inode);
/* path 1: server can do getattr-by-FID via an intent lock */
2624         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2625                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2626                 struct md_op_data *op_data;
2628                 /* Call getattr by fid, so do not provide name at all. */
2629                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2630                                              dentry->d_inode, NULL, 0, 0,
2631                                              LUSTRE_OPC_ANY, NULL);
2632                 if (IS_ERR(op_data))
2633                         RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to validate that the FID is still live */
2635                 oit.it_flags |= O_CHECK_STALE;
2636                 rc = md_intent_lock(exp, op_data, NULL, 0,
2637                                     /* we are not interested in name
2640                                     ll_md_blocking_ast, 0);
2641                 ll_finish_md_op_data(op_data);
2642                 oit.it_flags &= ~O_CHECK_STALE;
2644                         rc = ll_inode_revalidate_fini(inode, rc);
2648                 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2650                         ll_intent_release(&oit);
2654                 /* Unlinked? Unhash dentry, so it is not picked up later by
2655                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2656                    here to preserve get_cwd functionality on 2.6.
2658                 if (!dentry->d_inode->i_nlink) {
2659                         spin_lock(&dcache_lock);
2660                         ll_drop_dentry(dentry);
2661                         spin_unlock(&dcache_lock);
2664                 ll_lookup_finish_locks(&oit, dentry);
/* path 2: no cached UPDATE lock -> plain getattr RPC to the MDS */
2665         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE)) {
2666                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2667                 obd_valid valid = OBD_MD_FLGETATTR;
2668                 struct obd_capa *oc;
/* regular files also need striping EA; size the reply buffer for it */
2671                 if (S_ISREG(inode->i_mode)) {
2672                         rc = ll_get_max_mdsize(sbi, &ealen);
2675                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2677                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2678                  * capa for this inode. Because we only keep capas of dirs
2680                 oc = ll_mdscapa_get(inode);
2681                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2685                         rc = ll_inode_revalidate_fini(inode, rc);
/* rebuild the in-core inode from the getattr reply */
2689                 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2695         /* if object not yet allocated, don't validate size */
2696         if (ll_i2info(inode)->lli_smd == NULL)
2699         /* ll_glimpse_size will prefer locally cached writes if they extend
2701         rc = ll_glimpse_size(inode, 0);
2704         ptlrpc_req_finished(req);
/* ll_getattr_it(): fill a struct kstat for @de after revalidating the
 * inode with the MDS via ll_inode_revalidate_it().  Most fields are
 * copied straight from the (now fresh) in-core inode; size/blocks are
 * read under the llite inode size lock so they are consistent with
 * concurrent writers.
 * NOTE(review): sparse excerpt — the error check after revalidate and
 * the final return are elided. */
2708 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2709                   struct lookup_intent *it, struct kstat *stat)
2711         struct inode *inode = de->d_inode;
/* refresh attributes from the MDS before reporting them */
2714         res = ll_inode_revalidate_it(de, it);
2715         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2720         stat->dev = inode->i_sb->s_dev;
2721         stat->ino = inode->i_ino;
2722         stat->mode = inode->i_mode;
2723         stat->nlink = inode->i_nlink;
2724         stat->uid = inode->i_uid;
2725         stat->gid = inode->i_gid;
2726         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2727         stat->atime = inode->i_atime;
2728         stat->mtime = inode->i_mtime;
2729         stat->ctime = inode->i_ctime;
/* i_blksize was removed from struct inode in later kernels; fall back
 * to deriving it from i_blkbits */
2730 #ifdef HAVE_INODE_BLKSIZE
2731         stat->blksize = inode->i_blksize;
2733         stat->blksize = 1 << inode->i_blkbits;
/* size/blocks must be read under the llite size lock for consistency */
2736         ll_inode_size_lock(inode, 0);
2737         stat->size = i_size_read(inode);
2738         stat->blocks = inode->i_blocks;
2739         ll_inode_size_unlock(inode, 0);
/* ll_getattr(): VFS getattr entry point — wraps ll_getattr_it() with
 * a freshly-initialized IT_GETATTR lookup intent. */
2743 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2745         struct lookup_intent it = { .it_op = IT_GETATTR };
2747         return ll_getattr_it(mnt, de, &it, stat);
/* lustre_check_acl(): permission callback for generic_permission() —
 * evaluate the cached POSIX ACL stored in the llite inode info against
 * @mask.  The ACL is duplicated under lli_lock so the check runs on a
 * private reference, then released.  Compiled out when
 * CONFIG_FS_POSIX_ACL is disabled (that branch is elided here). */
2751 int lustre_check_acl(struct inode *inode, int mask)
2753 #ifdef CONFIG_FS_POSIX_ACL
2754         struct ll_inode_info *lli = ll_i2info(inode);
2755         struct posix_acl *acl;
/* take a private reference to the cached ACL under the inode spinlock */
2759         spin_lock(&lli->lli_lock);
2760         acl = posix_acl_dup(lli->lli_posix_acl);
2761         spin_unlock(&lli->lli_lock);
2766         rc = posix_acl_permission(inode, acl, mask);
2767         posix_acl_release(acl);
/* ll_inode_permission(): VFS permission check.  Two variants:
 *  - kernels >= 2.6.10: delegate to generic_permission() with
 *    lustre_check_acl as the ACL callback;
 *  - older kernels: an open-coded permission check (owner/group/other
 *    bits, ACL fallback, then capability overrides) mirroring the
 *    classic kernel vfs_permission logic.
 * Both variants route remote-client mounts (LL_SBI_RMT_CLIENT) to
 * lustre_check_remote_perm() instead.
 * NOTE(review): sparse excerpt — several return statements, labels
 * and #else/#endif lines are elided. */
2775 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2776 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2778         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2779                inode->i_ino, inode->i_generation, inode, mask);
/* remote clients use server-side permission checks */
2780         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2781                 return lustre_check_remote_perm(inode, mask);
2783         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2784         return generic_permission(inode, mask, lustre_check_acl);
/* pre-2.6.10 variant: open-coded permission logic */
2787 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2789         int mode = inode->i_mode;
2792         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2793                inode->i_ino, inode->i_generation, inode, mask);
2795         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2796                 return lustre_check_remote_perm(inode, mask);
2798         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* writes are refused on read-only and immutable inodes */
2800         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2801             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2803         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* owner check first; shifting mode selects the owner permission bits */
2805         if (current->fsuid == inode->i_uid) {
2808                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2810                         rc = lustre_check_acl(inode, mask);
2814                         goto check_capabilities;
/* then group membership */
2818         if (in_group_p(inode->i_gid))
/* finally the "other" permission bits */
2821         if ((mode & mask & S_IRWXO) == mask)
/* capability overrides: DAC_OVERRIDE for most access, DAC_READ_SEARCH
 * for read and directory search */
2825         if (!(mask & MAY_EXEC) ||
2826             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2827                 if (capable(CAP_DAC_OVERRIDE))
2830         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2831             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock methods, so the
 * kernel's local lock handling applies (locks are not cluster-wide). */
2839 struct file_operations ll_file_operations = {
2840         .read           = ll_file_read,
2841         .write          = ll_file_write,
2842         .ioctl          = ll_file_ioctl,
2843         .open           = ll_file_open,
2844         .release        = ll_file_release,
2845         .mmap           = ll_file_mmap,
2846         .llseek         = ll_file_seek,
2847         .sendfile       = ll_file_sendfile,
/* file_operations for "-o flock" mounts: identical to the default
 * table, plus cluster-coherent locking through ll_file_flock.  On
 * kernels with f_op->flock both .flock and .lock are wired up;
 * otherwise only .lock. */
2851 struct file_operations ll_file_operations_flock = {
2852         .read           = ll_file_read,
2853         .write          = ll_file_write,
2854         .ioctl          = ll_file_ioctl,
2855         .open           = ll_file_open,
2856         .release        = ll_file_release,
2857         .mmap           = ll_file_mmap,
2858         .llseek         = ll_file_seek,
2859         .sendfile       = ll_file_sendfile,
2861 #ifdef HAVE_F_OP_FLOCK
2862         .flock          = ll_file_flock,
2864         .lock           = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
2868 struct file_operations ll_file_operations_noflock = {
2869         .read           = ll_file_read,
2870         .write          = ll_file_write,
2871         .ioctl          = ll_file_ioctl,
2872         .open           = ll_file_open,
2873         .release        = ll_file_release,
2874         .mmap           = ll_file_mmap,
2875         .llseek         = ll_file_seek,
2876         .sendfile       = ll_file_sendfile,
/* same HAVE_F_OP_FLOCK split as the flock table, but every lock call
 * lands in ll_file_noflock (ENOSYS) */
2878 #ifdef HAVE_F_OP_FLOCK
2879         .flock          = ll_file_noflock,
2881         .lock           = ll_file_noflock
/* inode_operations for regular files.  With the VFS intent patches
 * applied, setattr goes through the raw variant; otherwise the plain
 * ll_setattr is used. */
2884 struct inode_operations ll_file_inode_operations = {
2885 #ifdef HAVE_VFS_INTENT_PATCHES
2886         .setattr_raw    = ll_setattr_raw,
2888         .setattr        = ll_setattr,
2889         .truncate       = ll_truncate,
2890         .getattr        = ll_getattr,
2891         .permission     = ll_inode_permission,
2892         .setxattr       = ll_setxattr,
2893         .getxattr       = ll_getxattr,
2894         .listxattr      = ll_listxattr,
2895         .removexattr    = ll_removexattr,
/* dynamic ioctl number support routins */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a read/write semaphore (readers are
 * ioctl dispatch, writers are register/unregister). */
2899 static struct llioc_ctl_data {
2900         struct rw_semaphore     ioc_sem;
2901         struct list_head        ioc_head;
2903         __RWSEM_INITIALIZER(llioc.ioc_sem),
2904         CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback, total allocation size, and a
 * flexible array of iocd_count ioctl command numbers it services.
 * NOTE(review): the struct llioc_data opening line is elided in this
 * excerpt. */
2909         struct list_head        iocd_list;
2910         unsigned int            iocd_size;
2911         llioc_callback_t        iocd_cb;
2912         unsigned int            iocd_count;
2913         unsigned int            iocd_cmd[0];
/* ll_iocontrol_register(): register a dynamic ioctl handler @cb for
 * the @count command numbers in @cmd.  Allocates a llioc_data entry
 * (header plus trailing command array), copies the command list, and
 * appends it to the global registry under the write semaphore.
 * Returns an opaque cookie (presumably the entry pointer — return is
 * elided here) for later ll_iocontrol_unregister(). */
2916 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2919         struct llioc_data *in_data = NULL;
/* validate arguments before allocating */
2922         if (cb == NULL || cmd == NULL ||
2923             count > LLIOC_MAX_CMD || count < 0)
/* header plus trailing array of @count command numbers */
2926         size = sizeof(*in_data) + count * sizeof(unsigned int);
2927         OBD_ALLOC(in_data, size);
2928         if (in_data == NULL)
2931         memset(in_data, 0, sizeof(*in_data));
2932         in_data->iocd_size = size;
2933         in_data->iocd_cb = cb;
2934         in_data->iocd_count = count;
2935         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* publish the entry under the writer side of the registry semaphore */
2937         down_write(&llioc.ioc_sem);
2938         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2939         up_write(&llioc.ioc_sem);
/* ll_iocontrol_unregister(): remove and free the handler previously
 * registered under @magic.  Walks the registry under the write
 * semaphore; on a match the entry is unlinked, the semaphore dropped,
 * and the entry freed using its recorded allocation size.  If no
 * entry matches, a warning is logged.
 * NOTE(review): sparse excerpt — the magic-comparison line and the
 * early return after OBD_FREE are elided. */
2944 void ll_iocontrol_unregister(void *magic)
2946         struct llioc_data *tmp;
2951         down_write(&llioc.ioc_sem);
2952         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* remember the size before unlinking: OBD_FREE needs it */
2954                 unsigned int size = tmp->iocd_size;
2956                 list_del(&tmp->iocd_list);
/* safe to drop the lock before freeing — entry is already unlinked */
2957                 up_write(&llioc.ioc_sem);
2959                 OBD_FREE(tmp, size);
2963         up_write(&llioc.ioc_sem);
2965         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* export the dynamic-ioctl registration API to other kernel modules */
2968 EXPORT_SYMBOL(ll_iocontrol_register);
2969 EXPORT_SYMBOL(ll_iocontrol_unregister);
2971 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2972 unsigned int cmd, unsigned long arg, int *rcp)
2974 enum llioc_iter ret = LLIOC_CONT;
2975 struct llioc_data *data;
2976 int rc = -EINVAL, i;
2978 down_read(&llioc.ioc_sem);
2979 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2980 for (i = 0; i < data->iocd_count; i++) {
2981 if (cmd != data->iocd_cmd[i])
2984 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2988 if (ret == LLIOC_STOP)
2991 up_read(&llioc.ioc_sem);