1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/* Allocate one struct ll_file_data from the ll_file_data_slab cache.
 * Paired with ll_file_data_put(); also used by llite/special.c:ll_special_open().
 * NOTE(review): this dump elides lines — the opening brace and the return of
 * @fd are not visible here; presumably fd (possibly NULL on allocation
 * failure) is returned to the caller — confirm against the full source. */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a per-open ll_file_data back to its slab cache.
 * Counterpart of ll_file_data_get(). */
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes, IO epoch, open file handle @fh and
 * MDS capability into @op_data, for sending to the MDS (used on close and
 * Size-on-MDS updates).  op_capa1 takes a capa reference via
 * ll_mdscapa_get(); the consumer of @op_data is responsible for dropping it
 * (presumably via ll_finish_md_op_data() — confirm in full source). */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
/* i_flags is smuggled through the iattr via the ll_iattr overlay cast. */
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
61 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close RPC on @och.
 * Always sends mode/atime/mtime/ctime; size/blocks are added only when the
 * MDS will not obtain them itself via Size-on-MDS (no OBD_CONNECT_SOM on the
 * MDC export, or not a regular file), or when the handle was read-only.
 * ll_epoch_close() is called with &och (handle-pointer address), so it can
 * presumably clear the caller's och pointer — confirm against its
 * definition; elided lines hide the surrounding control flow. */
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
72 if (!(och->och_flags & FMODE_WRITE))
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send the MDS close RPC for open handle @och and clean up local state.
 * On md_close() asking for a Size-on-MDS update, glimpses the OSTs and
 * setattrs the size back to the MDS.  Destroys OST objects if the close
 * reply requests it.  Finally invalidates the handle cookie and clears
 * open-replay data.
 * NOTE(review): this dump elides lines — the @inode parameter, rc/epoch_close
 * declarations, several if-conditions and the GOTO targets are not visible;
 * the comments below describe only what the visible lines establish. */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
/* MF_EPOCH_CLOSE may have been set on op_data by ll_prepare_close(). */
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr to back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* Writable regular file whose epoch is still open under SOM: defer the
 * DONE_WRITING RPC instead of dropping it. */
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
/* Close the cached MDS open handle of the kind selected by @flags
 * (write / exec / read) if no other local opens still use it.
 * Selects the per-mode och slot and use-count under lli_och_sem; bails out
 * early (handle kept) while *och_usecount is non-zero.  The actual close RPC
 * goes through ll_close_inode_openhandle().
 * NOTE(review): elided lines hide how *och_p is fetched/cleared between
 * lines 193 and 202 — presumably och = *och_p; *och_p = NULL under the
 * semaphore; confirm against the full source. */
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-struct-file close: drop any group lock, decrement the per-mode
 * open-fd count under lli_och_sem, and close the MDS open handle via
 * ll_md_real_close() unless a cached OPEN DLM lock (MDS_INODELOCK_OPEN
 * ibits, matched with LDLM_FL_TEST_LOCK) lets us skip talking to the MDS.
 * Frees the file's ll_file_data and drops the OSS capability.
 * NOTE(review): elided lines hide lockmode selection and several closing
 * braces; comments below cover only the visible lines. */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
/* Shadows the @inode parameter with the dentry's inode. */
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
/* No cached OPEN lock of the needed mode -> must close on the MDS. */
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
/* VFS ->release() handler.  Tears down remote-ACL tracking for the root
 * inode (remote-client mode), short-circuits for "/" itself, clears any
 * pending stripe async rc, then funnels into ll_md_close().
 * See the comment above: fput() ignores our return code, so everything
 * must be cleaned up here regardless of errors. */
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
289 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL cleanup applies only to the filesystem root. */
290 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
291 inode == inode->i_sb->s_root->d_inode) {
292 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
295 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
296 fd->fd_flags &= ~LL_FILE_RMTACL;
297 rct_del(&sbi->ll_rct, cfs_curproc_pid());
298 et_search_free(&sbi->ll_et, cfs_curproc_pid());
303 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
304 fd = LUSTRE_FPRIVATE(file);
307 /* don't do anything for / */
308 if (inode->i_sb->s_root == file->f_dentry) {
309 LUSTRE_FPRIVATE(file) = NULL;
310 ll_file_data_put(fd);
/* Clear any stashed per-stripe async error before the real close. */
315 lov_test_and_clear_async_rc(lsm);
316 lli->lli_async_rc = 0;
318 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Send an IT_OPEN intent RPC to the MDS for @file (used when no cached open
 * handle / disposition exists, e.g. NFSD or re-open after close).  Requests
 * an OPEN DLM lock unless @lmm/@lmmsize indicate this is only a setstripe.
 * On success updates the DLM lock data and refreshes the inode from the
 * reply.  Keeps its own -ESTALE exit path to avoid log flooding.
 * NOTE(review): elided lines hide the IS_ERR(op_data)/rc checks and GOTO
 * targets between several of the visible calls. */
322 static int ll_intent_file_open(struct file *file, void *lmm,
323 int lmmsize, struct lookup_intent *itp)
325 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
326 struct dentry *parent = file->f_dentry->d_parent;
327 const char *name = file->f_dentry->d_name.name;
328 const int len = file->f_dentry->d_name.len;
329 struct md_op_data *op_data;
330 struct ptlrpc_request *req;
336 /* Usually we come here only for NFSD, and we want open lock.
337 But we can also get here with pre 2.6.15 patchless kernels, and in
338 that case that lock is also ok */
339 /* We can also get here if there was cached open handle in revalidate_it
340 * but it disappeared while we were getting from there to ll_file_open.
341 * But this means this file was closed and immediatelly opened which
342 * makes a good candidate for using OPEN lock */
343 /* If lmmsize & lmm are not 0, we are just setting stripe info
344 * parameters. No need for the open lock */
345 if (!lmm && !lmmsize)
346 itp->it_flags |= MDS_OPEN_LOCK;
348 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
349 file->f_dentry->d_inode, name, len,
350 O_RDWR, LUSTRE_OPC_ANY, NULL);
352 RETURN(PTR_ERR(op_data));
354 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
355 0 /*unused */, &req, ll_md_blocking_ast, 0);
356 ll_finish_md_op_data(op_data);
358 /* reason for keep own exit path - don`t flood log
359 * with messages with -ESTALE errors.
/* Open was granted but errored: release the server-side open handle. */
361 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
362 it_open_error(DISP_OPEN_OPEN, itp))
364 ll_release_openhandle(file->f_dentry, itp);
368 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
369 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
370 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* Attach the inode to the granted DLM lock so blocking ASTs find it. */
374 if (itp->d.lustre.it_lock_mode)
375 md_set_lock_data(sbi->ll_md_exp,
376 &itp->d.lustre.it_lock_handle,
377 file->f_dentry->d_inode);
379 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
381 ptlrpc_req_finished(itp->d.lustre.it_data);
384 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
385 ll_intent_drop_lock(itp);
/* Fill @och from the MDS open reply carried in @it: copy the server file
 * handle, FID, open flags and magic, record the reply's IO epoch in @lli,
 * and register the open for replay.  Returns md_set_open_replay_data() rc. */
390 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
391 struct lookup_intent *it, struct obd_client_handle *och)
393 struct ptlrpc_request *req = it->d.lustre.it_data;
394 struct mdt_body *body;
398 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
399 LASSERT(body != NULL); /* reply already checked out */
401 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
402 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
403 och->och_fid = lli->lli_fid;
404 och->och_flags = it->it_flags;
405 lli->lli_ioepoch = body->ioepoch;
407 return md_set_open_replay_data(md_exp, och, req);
/* Finish the local half of an open: optionally fill @och from the intent
 * reply (the och != NULL branch — guarded by elided code), then install @fd
 * as the file's private data, init readahead state and record the open
 * mode.  Asserts the file has no private data yet.
 * NOTE(review): the condition guarding the och-fill branch (original lines
 * ~418-421) is elided from this dump. */
410 int ll_local_open(struct file *file, struct lookup_intent *it,
411 struct ll_file_data *fd, struct obd_client_handle *och)
413 struct inode *inode = file->f_dentry->d_inode;
414 struct ll_inode_info *lli = ll_i2info(inode);
417 LASSERT(!LUSTRE_FPRIVATE(file));
422 struct ptlrpc_request *req = it->d.lustre.it_data;
423 struct mdt_body *body;
426 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
430 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
431 if ((it->it_flags & FMODE_WRITE) &&
432 (body->valid & OBD_MD_FLSIZE))
433 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
434 lli->lli_ioepoch, PFID(&lli->lli_fid));
437 LUSTRE_FPRIVATE(file) = fd;
438 ll_readahead_init(inode, &fd->fd_ras);
439 fd->fd_omode = it->it_flags;
443 /* Open a file, and (for the very first open) create objects on the OSTs at
444 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
445 * creation or open until ll_lov_setstripe() ioctl is called. We grab
446 * lli_open_sem to ensure no other process will create objects, send the
447 * stripe MD to the MDS, or try to destroy the objects if that fails.
449 * If we already have the stripe MD locally then we don't request it in
450 * md_open(), by passing a lmm_size = 0.
452 * It is up to the application to ensure no other processes open this file
453 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
454 * used. We might be able to avoid races of that sort by getting lli_open_sem
455 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
456 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() handler.  Either reuses a cached per-mode MDS open handle
 * (write/exec/read slot in lli, protected by lli_och_sem) or performs a new
 * IT_OPEN intent via ll_intent_file_open(), then completes locally with
 * ll_local_open().  O_LOV_DELAY_CREATE defers OST object creation until the
 * ll_lov_setstripe() ioctl (see block comment above this function).
 * NOTE(review): this dump elides a large number of lines (rc declarations,
 * if-conditions, GOTO targets, closing braces); comments below are limited
 * to what the visible lines establish. */
458 int ll_file_open(struct inode *inode, struct file *file)
460 struct ll_inode_info *lli = ll_i2info(inode);
461 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
462 .it_flags = file->f_flags };
463 struct lov_stripe_md *lsm;
464 struct ptlrpc_request *req = NULL;
465 struct obd_client_handle **och_p;
467 struct ll_file_data *fd;
471 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
472 inode->i_generation, inode, file->f_flags);
474 #ifdef HAVE_VFS_INTENT_PATCHES
477 it = file->private_data; /* XXX: compat macro */
478 file->private_data = NULL; /* prevent ll_local_open assertion */
481 fd = ll_file_data_get();
485 /* don't do anything for / */
486 if (inode->i_sb->s_root == file->f_dentry) {
487 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent disposition: build our own open intent in oit. */
491 if (!it || !it->d.lustre.it_disposition) {
492 /* Convert f_flags into access mode. We cannot use file->f_mode,
493 * because everything but O_ACCMODE mask was stripped from
/* O_RDONLY(0)/O_WRONLY(1)/O_RDWR(2) + 1 maps to FMODE_READ/WRITE bits. */
495 if ((oit.it_flags + 1) & O_ACCMODE)
497 if (file->f_flags & O_TRUNC)
498 oit.it_flags |= FMODE_WRITE;
500 /* kernel only call f_op->open in dentry_open. filp_open calls
501 * dentry_open after call to open_namei that checks permissions.
502 * Only nfsd_open call dentry_open directly without checking
503 * permissions and because of that this code below is safe. */
504 if (oit.it_flags & FMODE_WRITE)
505 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
507 /* We do not want O_EXCL here, presumably we opened the file
508 * already? XXX - NFS implications? */
509 oit.it_flags &= ~O_EXCL;
515 /* Let's see if we have file open on MDS already. */
516 if (it->it_flags & FMODE_WRITE) {
517 och_p = &lli->lli_mds_write_och;
518 och_usecount = &lli->lli_open_fd_write_count;
519 } else if (it->it_flags & FMODE_EXEC) {
520 och_p = &lli->lli_mds_exec_och;
521 och_usecount = &lli->lli_open_fd_exec_count;
523 och_p = &lli->lli_mds_read_och;
524 och_usecount = &lli->lli_open_fd_read_count;
527 down(&lli->lli_och_sem);
528 if (*och_p) { /* Open handle is present */
529 if (it_disposition(it, DISP_OPEN_OPEN)) {
530 /* Well, there's extra open request that we do not need,
531 let's close it somehow. This will decref request. */
532 rc = it_open_error(DISP_OPEN_OPEN, it);
534 ll_file_data_put(fd);
535 GOTO(out_och_free, rc);
537 ll_release_openhandle(file->f_dentry, it);
538 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse cached handle: och == NULL tells ll_local_open to skip och-fill. */
543 rc = ll_local_open(file, it, fd, NULL);
545 up(&lli->lli_och_sem);
546 ll_file_data_put(fd);
550 LASSERT(*och_usecount == 0);
551 if (!it->d.lustre.it_disposition) {
552 /* We cannot just request lock handle now, new ELC code
553 means that one of other OPEN locks for this file
554 could be cancelled, and since blocking ast handler
555 would attempt to grab och_sem as well, that would
556 result in a deadlock */
557 up(&lli->lli_och_sem);
558 it->it_flags |= O_CHECK_STALE;
559 rc = ll_intent_file_open(file, NULL, 0, it);
560 it->it_flags &= ~O_CHECK_STALE;
562 ll_file_data_put(fd);
563 GOTO(out_openerr, rc);
566 /* Got some error? Release the request */
567 if (it->d.lustre.it_status < 0) {
568 req = it->d.lustre.it_data;
569 ptlrpc_req_finished(req);
571 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
572 &it->d.lustre.it_lock_handle,
573 file->f_dentry->d_inode);
576 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
578 ll_file_data_put(fd);
579 GOTO(out_och_free, rc = -ENOMEM);
582 req = it->d.lustre.it_data;
584 /* md_intent_lock() didn't get a request ref if there was an
585 * open error, so don't do cleanup on the request here
587 /* XXX (green): Should not we bail out on any error here, not
588 * just open error? */
589 rc = it_open_error(DISP_OPEN_OPEN, it);
591 ll_file_data_put(fd);
592 GOTO(out_och_free, rc);
595 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
596 rc = ll_local_open(file, it, fd, *och_p);
598 up(&lli->lli_och_sem);
599 ll_file_data_put(fd);
600 GOTO(out_och_free, rc);
603 up(&lli->lli_och_sem);
605 /* Must do this outside lli_och_sem lock to prevent deadlock where
606 different kind of OPEN lock for this same inode gets cancelled
607 by ldlm_cancel_lru */
608 if (!S_ISREG(inode->i_mode))
615 if (file->f_flags & O_LOV_DELAY_CREATE ||
616 !(file->f_mode & FMODE_WRITE)) {
617 CDEBUG(D_INODE, "object creation was delayed\n");
621 file->f_flags &= ~O_LOV_DELAY_CREATE;
624 ptlrpc_req_finished(req);
626 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
630 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
631 *och_p = NULL; /* OBD_FREE writes some magic there */
634 up(&lli->lli_och_sem);
635 out_openerr: ;/* Looks weierd, eh? Just wait for statahead code to insert
636 a statement here <-- remove this comment after statahead
643 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch OST attributes (size/blocks/times) for the inode's stripe objects
 * via an async obd_getattr on the data export, wait for completion, then
 * refresh the inode from the returned obdo.  Requires lli_smd != NULL.
 * NOTE(review): elided lines hide oinfo.oi_oa setup / error paths; the
 * obdo argument's wiring into oinfo is not visible in this dump. */
644 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
646 struct ptlrpc_request_set *set;
647 struct ll_inode_info *lli = ll_i2info(inode);
648 struct lov_stripe_md *lsm = lli->lli_smd;
650 struct obd_info oinfo = { { { 0 } } };
654 LASSERT(lsm != NULL);
658 oinfo.oi_oa->o_id = lsm->lsm_object_id;
659 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
660 oinfo.oi_oa->o_mode = S_IFREG;
661 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
662 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
663 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
664 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
666 oinfo.oi_capa = ll_mdscapa_get(inode);
668 set = ptlrpc_prep_set();
670 CERROR("can't allocate ptlrpc set\n");
673 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
675 rc = ptlrpc_set_wait(set);
676 ptlrpc_set_destroy(set);
678 capa_put(oinfo.oi_capa);
/* Keep only attributes the OSTs are authoritative for. */
682 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
683 OBD_MD_FLATIME | OBD_MD_FLMTIME |
684 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
686 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
687 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
688 lli->lli_smd->lsm_object_id, i_size_read(inode),
689 (unsigned long long)inode->i_blocks, ll_inode_blksize(inode));
/* Strip setuid (and setgid when group-executable) bits from the inode's
 * mode, mirroring the kernel's remove_suid() semantics, unless the current
 * task has CAP_FSETID. */
693 static inline void ll_remove_suid(struct inode *inode)
697 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
698 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
700 /* was any of the uid bits set? */
701 mode &= inode->i_mode;
702 if (mode && !capable(CAP_FSETID)) {
703 inode->i_mode &= ~mode;
704 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the file stripe index it covers.
 * Single-stripe files trivially map to stripe 0; otherwise the LOV is
 * queried via obd_get_info("lock_to_stripe").  Sanity-checks that the
 * lock's resource name matches the stripe object id/group, returning
 * -ELDLM_NO_LOCK_DATA on mismatch.
 * NOTE(review): the successful-return path (presumably RETURN(stripe)) is
 * elided from this dump. */
708 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
710 struct ll_inode_info *lli = ll_i2info(inode);
711 struct lov_stripe_md *lsm = lli->lli_smd;
712 struct obd_export *exp = ll_i2dtexp(inode);
715 struct ldlm_lock *lock;
716 struct lov_stripe_md *lsm;
717 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
718 __u32 stripe, vallen = sizeof(stripe);
722 if (lsm->lsm_stripe_count == 1)
723 GOTO(check, stripe = 0);
725 /* get our offset in the lov */
726 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
728 CERROR("obd_get_info: rc = %d\n", rc);
731 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name[0] is the object id, name[2] the object group. */
734 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
735 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
736 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
737 lsm->lsm_oinfo[stripe]->loi_id,
738 lsm->lsm_oinfo[stripe]->loi_gr);
739 RETURN(-ELDLM_NO_LOCK_DATA);
745 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
746 * we get a lock cancellation for each stripe, so we have to map the obd's
747 * region back onto the stripes in the file that it held.
749 * No one can dirty the extent until we've finished our work and they can
750 * enqueue another lock. The DLM protects us from ll_file_read/write here,
751 * but other kernel actors could have pages locked.
753 * Called with the DLM lock held. */
/* Evict page-cache pages covered by a cancelled extent @lock on @stripe:
 * map the per-stripe extent back to file page indices, tear down mmap()ed
 * ranges, then walk the pages — writing back dirty pages (unless
 * LDLM_FL_DISCARD_DATA) and truncating those not covered by another
 * granted DLM lock.  See the block comment above the function.
 * NOTE(review): this dump elides many lines (loop braces, page checks,
 * lock_page/unlock paths); comments below annotate only visible lines. */
754 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
755 struct ldlm_lock *lock, __u32 stripe)
757 ldlm_policy_data_t tmpex;
758 unsigned long start, end, count, skip, i, j;
760 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
761 struct lustre_handle lockh;
762 struct address_space *mapping = inode->i_mapping;
765 tmpex = lock->l_policy_data;
766 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
767 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
770 /* our locks are page granular thanks to osc_enqueue, we invalidate the
772 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
773 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
774 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
776 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
777 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* Convert the stripe-local [start,end] byte extent into file-wide page
 * indices, accounting for the stripe's position in the RAID0 layout. */
781 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
782 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
783 if (lsm->lsm_stripe_count > 1) {
784 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
785 skip = (lsm->lsm_stripe_count - 1) * count;
786 start += start/count * skip + stripe * count;
788 end += end/count * skip + stripe * count;
790 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
/* Clamp the walk to the last page that can exist given i_size. */
793 i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
798 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
799 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
800 count, skip, end, discard ? " (DISCARDING)" : "");
802 /* walk through the vmas on the inode and tear down mmaped pages that
803 * intersect with the lock. this stops immediately if there are no
804 * mmap()ed regions of the file. This is not efficient at all and
805 * should be short lived. We'll associate mmap()ed pages with the lock
806 * and will be able to find them directly */
807 for (i = start; i <= end; i += (j + skip)) {
808 j = min(count - (i % count), end - i + 1);
811 if (ll_teardown_mmaps(mapping,
812 (__u64)i << CFS_PAGE_SHIFT,
813 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
817 /* this is the simplistic implementation of page eviction at
818 * cancelation. It is careful to get races with other page
819 * lockers handled correctly. fixes from bug 20 will make it
820 * more efficient by associating locks with pages and with
821 * batching writeback under the lock explicitly. */
822 for (i = start, j = start % count; i <= end;
823 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
825 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
831 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
832 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
833 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
836 if (!mapping_has_pages(mapping)) {
837 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
843 page = find_get_page(mapping, i);
846 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
847 i, tmpex.l_extent.start);
850 /* page->mapping to check with racing against teardown */
851 if (!discard && clear_page_dirty_for_io(page)) {
852 rc = ll_call_writepage(inode, page);
853 /* either waiting for io to complete or reacquiring
854 * the lock that the failed writepage released */
856 wait_on_page_writeback(page);
858 CERROR("writepage inode %lu(%p) of page %p "
859 "failed: %d\n", inode->i_ino, inode,
/* Record the writeback failure on the mapping for later fsync/close. */
862 set_bit(AS_ENOSPC, &mapping->flags);
864 set_bit(AS_EIO, &mapping->flags);
868 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
869 /* check to see if another DLM lock covers this page b=2765 */
870 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
871 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
873 &lock->l_resource->lr_name, LDLM_EXTENT,
874 &tmpex, LCK_PR | LCK_PW, &lockh);
876 if (rc2 <= 0 && page->mapping != NULL) {
877 struct ll_async_page *llap = llap_cast_private(page);
878 /* checking again to account for writeback's
880 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
882 ll_ra_accounting(llap, mapping);
883 ll_truncate_complete_page(page);
886 page_cache_release(page);
888 LASSERTF(tmpex.l_extent.start <=
889 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
890 lock->l_policy_data.l_extent.end + 1),
891 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
892 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* Blocking/cancel AST for client extent locks.
 * LDLM_CB_BLOCKING: cancel the lock.  LDLM_CB_CANCELING: flush the covered
 * page-cache range via ll_pgcache_remove_extent() and shrink the stripe's
 * known-minimum-size (kms) under the lsm and lock resource locks.
 * NOTE(review): the flag switch header, iput/exit paths and several braces
 * are elided from this dump. */
897 static int ll_extent_lock_callback(struct ldlm_lock *lock,
898 struct ldlm_lock_desc *new, void *data,
901 struct lustre_handle lockh = { 0 };
/* Small non-NULL integers disguised as pointers indicate corrupt cbdata. */
905 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
906 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
911 case LDLM_CB_BLOCKING:
912 ldlm_lock2handle(lock, &lockh);
913 rc = ldlm_cli_cancel(&lockh);
915 CERROR("ldlm_cli_cancel failed: %d\n", rc);
917 case LDLM_CB_CANCELING: {
919 struct ll_inode_info *lli;
920 struct lov_stripe_md *lsm;
924 /* This lock wasn't granted, don't try to evict pages */
925 if (lock->l_req_mode != lock->l_granted_mode)
928 inode = ll_inode_from_lock(lock);
931 lli = ll_i2info(inode);
934 if (lli->lli_smd == NULL)
938 stripe = ll_lock_to_stripe_offset(inode, lock);
942 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
944 lov_stripe_lock(lsm);
945 lock_res_and_lock(lock);
946 kms = ldlm_extent_shift_kms(lock,
947 lsm->lsm_oinfo[stripe]->loi_kms);
949 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
950 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
951 lsm->lsm_oinfo[stripe]->loi_kms, kms);
952 lsm->lsm_oinfo[stripe]->loi_kms = kms;
953 unlock_res_and_lock(lock);
954 lov_stripe_unlock(lsm);
/* Completion AST for async (glimpse-style) extent enqueues.  Blocked
 * grants are not expected (LBUG).  On grant, updates the stripe's rss/kms
 * from the lock's LVB, wakes waiters on l_waitq and drops the PR reference.
 * NOTE(review): stripe/lvb/kms declarations and the iput/RETURN tail are
 * elided from this dump; note lsm_oinfo is indexed by value here
 * (lsm_oinfo[stripe].loi_*) unlike the pointer form elsewhere — presumably
 * an older struct layout; confirm against the matching headers. */
967 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
969 /* XXX ALLOCATE - 160 bytes */
970 struct inode *inode = ll_inode_from_lock(lock);
971 struct ll_inode_info *lli = ll_i2info(inode);
972 struct lustre_handle lockh = { 0 };
977 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
978 LDLM_FL_BLOCK_CONV)) {
979 LBUG(); /* not expecting any blocked async locks yet */
980 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
982 ldlm_lock_dump(D_OTHER, lock, 0);
983 ldlm_reprocess_all(lock->l_resource);
987 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
989 stripe = ll_lock_to_stripe_offset(inode, lock);
993 if (lock->l_lvb_len) {
994 struct lov_stripe_md *lsm = lli->lli_smd;
996 lvb = lock->l_lvb_data;
997 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
999 lock_res_and_lock(lock);
1000 ll_inode_size_lock(inode, 1);
/* kms can only grow from the LVB-reported size here (NULL lock arg). */
1001 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
1002 kms = ldlm_extent_shift_kms(NULL, kms);
1003 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
1004 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
1005 lsm->lsm_oinfo[stripe].loi_kms, kms);
1006 lsm->lsm_oinfo[stripe].loi_kms = kms;
1007 ll_inode_size_unlock(inode, 1);
1008 unlock_res_and_lock(lock);
1013 wake_up(&lock->l_waitq);
1015 ldlm_lock2handle(lock, &lockh);
1016 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants this file's size.  Packs an ost_lvb
 * reply with this client's kms for the lock's stripe plus the inode times.
 * -ELDLM_NO_LOCK_DATA races are answered with an empty reply rather than
 * ptlrpc_error() to avoid console noise.
 * NOTE(review): NULL-inode / lli_smd checks guarding the GOTOs at lines
 * 1032-1038, and the out/iput labels, are elided from this dump. */
1021 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1023 struct ptlrpc_request *req = reqp;
1024 struct inode *inode = ll_inode_from_lock(lock);
1025 struct ll_inode_info *lli;
1026 struct lov_stripe_md *lsm;
1027 struct ost_lvb *lvb;
1032 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1033 lli = ll_i2info(inode);
1035 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1038 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1040 /* First, find out which stripe index this lock corresponds to. */
1041 stripe = ll_lock_to_stripe_offset(inode, lock);
1043 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1045 req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
1046 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
1048 rc = req_capsule_server_pack(&req->rq_pill);
1050 CERROR("lustre_pack_reply: %d\n", rc);
1054 lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
/* Report our known-minimum-size for this stripe, not i_size. */
1055 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1056 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1057 lvb->lvb_atime = LTIME_S(inode->i_atime);
1058 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1060 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1061 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1062 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1063 lvb->lvb_atime, lvb->lvb_ctime);
1068 /* These errors are normal races, so we don't want to fill the console
1069 * with messages by calling ptlrpc_error() */
1070 if (rc == -ELDLM_NO_LOCK_DATA)
1071 lustre_pack_reply(req, 1, NULL, NULL);
1073 req->rq_status = rc;
/* Merge per-stripe LVB data (size/blocks/times) from the LOV into the
 * inode, under the ll inode size lock.  obd_merge_lvb() is called with
 * kms_only=0 here (see the note above ll_glimpse_size about KMS vs RSS). */
1077 static int ll_merge_lvb(struct inode *inode)
1079 struct ll_inode_info *lli = ll_i2info(inode);
1080 struct ll_sb_info *sbi = ll_i2sbi(inode);
1086 ll_inode_size_lock(inode, 1);
1087 inode_init_lvb(inode, &lvb);
1088 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1089 i_size_write(inode, lvb.lvb_size);
1090 inode->i_blocks = lvb.lvb_blocks;
1092 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1093 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1094 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1095 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size request from locally-cached [0, EOF] PR extent
 * locks: obd_match() an existing lock, merge the LVB into the inode, then
 * drop the match reference via obd_cancel().  No-op for zero-stripe files.
 * NOTE(review): the flags declaration and the rc checks between obd_match
 * and ll_merge_lvb are elided from this dump. */
1100 int ll_local_size(struct inode *inode)
1102 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1103 struct ll_inode_info *lli = ll_i2info(inode);
1104 struct ll_sb_info *sbi = ll_i2sbi(inode);
1105 struct lustre_handle lockh = { 0 };
1110 if (lli->lli_smd->lsm_stripe_count == 0)
1113 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1114 &policy, LCK_PR, &flags, inode, &lockh)
1120 rc = ll_merge_lvb(inode);
1121 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse a file described only by @lsm (no inode — ioctl path) and fill
 * the caller's stat structure from the merged LVB.  Uses a PR extent
 * enqueue with LDLM_FL_HAS_INTENT, so conflicting locks are glimpsed, not
 * revoked (same mechanism as ll_glimpse_size()).
 * NOTE(review): the @st parameter declaration, oinfo.oi_md/policy-start
 * setup and the error-filtering condition before line 1152 are elided. */
1125 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1128 struct lustre_handle lockh = { 0 };
1129 struct ldlm_enqueue_info einfo = { 0 };
1130 struct obd_info oinfo = { { { 0 } } };
1136 einfo.ei_type = LDLM_EXTENT;
1137 einfo.ei_mode = LCK_PR;
1138 einfo.ei_cb_bl = ll_extent_lock_callback;
1139 einfo.ei_cb_cp = ldlm_completion_ast;
1140 einfo.ei_cb_gl = ll_glimpse_callback;
/* No inode available on this path, so no cbdata for the ASTs. */
1141 einfo.ei_cbdata = NULL;
1143 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1144 oinfo.oi_lockh = &lockh;
1146 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1148 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1152 CERROR("obd_enqueue returned rc %d, "
1153 "returning -EIO\n", rc);
1154 RETURN(rc > 0 ? -EIO : rc);
1157 lov_stripe_lock(lsm);
1158 memset(&lvb, 0, sizeof(lvb));
1159 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1160 st->st_size = lvb.lvb_size;
1161 st->st_blocks = lvb.lvb_blocks;
1162 st->st_mtime = lvb.lvb_mtime;
1163 st->st_atime = lvb.lvb_atime;
1164 st->st_ctime = lvb.lvb_ctime;
1165 lov_stripe_unlock(lsm);
1170 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1171 * file (because it prefers KMS over RSS when larger) */
/* Refresh the inode's size/blocks by glimpsing the OSTs: a PR extent
 * enqueue on [0, EOF] with LDLM_FL_HAS_INTENT triggers ll_glimpse_callback()
 * on every lock holder instead of revoking their locks (see comment at
 * line 1192 below), then the returned LVBs are merged via ll_merge_lvb().
 * Skipped entirely when the MDS holds the authoritative size
 * (LLIF_MDS_SIZE_LOCK) or the file has no OST objects. */
1172 int ll_glimpse_size(struct inode *inode, int ast_flags)
1174 struct ll_inode_info *lli = ll_i2info(inode);
1175 struct ll_sb_info *sbi = ll_i2sbi(inode);
1176 struct lustre_handle lockh = { 0 };
1177 struct ldlm_enqueue_info einfo = { 0 };
1178 struct obd_info oinfo = { { { 0 } } };
1182 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1185 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1187 if (!lli->lli_smd) {
1188 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1192 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1193 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1194 * won't revoke any conflicting DLM locks held. Instead,
1195 * ll_glimpse_callback() will be called on each client
1196 * holding a DLM lock against this file, and resulting size
1197 * will be returned for each stripe. DLM lock on [0, EOF] is
1198 * acquired only if there were no conflicting locks. */
1199 einfo.ei_type = LDLM_EXTENT;
1200 einfo.ei_mode = LCK_PR;
1201 einfo.ei_cb_bl = ll_extent_lock_callback;
1202 einfo.ei_cb_cp = ldlm_completion_ast;
1203 einfo.ei_cb_gl = ll_glimpse_callback;
1204 einfo.ei_cbdata = inode;
1206 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1207 oinfo.oi_lockh = &lockh;
1208 oinfo.oi_md = lli->lli_smd;
1209 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1211 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1215 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1216 RETURN(rc > 0 ? -EIO : rc);
1219 rc = ll_merge_lvb(inode);
1221 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1222 i_size_read(inode), (unsigned long long)inode->i_blocks);
/*
 * Take a real LDLM extent lock of the given mode on [policy->start, end]
 * for this file, then merge the returned LVB into the inode under
 * ll_inode_size_lock().  *policy is updated to the extent actually
 * granted.  Locking is skipped entirely for LL_FILE_IGNORE_LOCK files
 * and NOLCK mounts (the early-return line is not visible in this
 * listing).  i_size is only overwritten from the LVB when the lock
 * covers the whole file [0, EOF] — see the long comment below.
 */
1227 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1228 struct lov_stripe_md *lsm, int mode,
1229 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1232 struct ll_sb_info *sbi = ll_i2sbi(inode);
1234 struct ldlm_enqueue_info einfo = { 0 };
1235 struct obd_info oinfo = { { { 0 } } };
1239 LASSERT(!lustre_handle_is_used(lockh));
1240 LASSERT(lsm != NULL);
1242 /* don't drop the mmapped file to LRU */
1243 if (mapping_mapped(inode->i_mapping))
1244 ast_flags |= LDLM_FL_NO_LRU;
1246 /* XXX phil: can we do this? won't it screw the file size up? */
1247 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1248 (sbi->ll_flags & LL_SBI_NOLCK))
1251 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1252 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1254 einfo.ei_type = LDLM_EXTENT;
1255 einfo.ei_mode = mode;
1256 einfo.ei_cb_bl = ll_extent_lock_callback;
1257 einfo.ei_cb_cp = ldlm_completion_ast;
1258 einfo.ei_cb_gl = ll_glimpse_callback;
1259 einfo.ei_cbdata = inode;
1261 oinfo.oi_policy = *policy;
1262 oinfo.oi_lockh = lockh;
1264 oinfo.oi_flags = ast_flags;
1266 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
/* Report back the (possibly enlarged) extent that was actually granted. */
1267 *policy = oinfo.oi_policy;
1271 ll_inode_size_lock(inode, 1);
1272 inode_init_lvb(inode, &lvb);
1273 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1275 if (policy->l_extent.start == 0 &&
1276 policy->l_extent.end == OBD_OBJECT_EOF) {
1277 /* vmtruncate()->ll_truncate() first sets the i_size and then
1278 * the kms under both a DLM lock and the
1279 * ll_inode_size_lock(). If we don't get the
1280 * ll_inode_size_lock() here we can match the DLM lock and
1281 * reset i_size from the kms before the truncating path has
1282 * updated the kms. generic_file_write can then trust the
1283 * stale i_size when doing appending writes and effectively
1284 * cancel the result of the truncate. Getting the
1285 * ll_inode_size_lock() after the enqueue maintains the DLM
1286 * -> ll_inode_size_lock() acquiring order. */
1287 i_size_write(inode, lvb.lvb_size);
1288 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1289 inode->i_ino, i_size_read(inode));
/* Timestamps are always refreshed from the merged LVB, size or not. */
1293 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1294 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1295 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1297 ll_inode_size_unlock(inode, 1);
/*
 * Drop an extent lock previously taken by ll_extent_lock().  Mirrors the
 * IGNORE_LOCK / NOLCK short-circuit in the lock path (its early return
 * is not visible in this listing) and otherwise cancels the lock via
 * obd_cancel() on the data export.
 */
1302 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1303 struct lov_stripe_md *lsm, int mode,
1304 struct lustre_handle *lockh)
1306 struct ll_sb_info *sbi = ll_i2sbi(inode);
1310 /* XXX phil: can we do this? won't it screw the file size up? */
1311 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1312 (sbi->ll_flags & LL_SBI_NOLCK))
1315 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read(2) entry point: take a PR extent tree-lock over the requested
 * region (possibly chunked by ll_max_rw_chunk, stripe-aligned via
 * obd_extent_calc), fix up i_size from the known-minimum-size (kms) /
 * glimpse, then service the chunk through generic_file_read() with
 * Lustre's own read-ahead window (kernel read-ahead disabled).
 * Objectless files (mknod+truncate, bug 6243) are satisfied with
 * zero-filled buffers.  NOTE(review): the listing is missing lines
 * (loop back-edge for multi-chunk reads, early RETURNs), so control
 * flow below is partially inferred — confirm against the full source.
 */
1320 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1323 struct inode *inode = file->f_dentry->d_inode;
1324 struct ll_inode_info *lli = ll_i2info(inode);
1325 struct lov_stripe_md *lsm = lli->lli_smd;
1326 struct ll_sb_info *sbi = ll_i2sbi(inode);
1327 struct ll_lock_tree tree;
1328 struct ll_lock_tree_node *node;
1330 struct ll_ra_read bead;
1333 ssize_t retval, chunk, sum = 0;
1337 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1338 inode->i_ino, inode->i_generation, inode, count, *ppos);
1339 /* "If nbyte is 0, read() will return 0 and have no other results."
1340 * -- Single Unix Spec */
1344 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1347 /* Read on file with no objects should return zero-filled
1348 * buffers up to file size (we can get non-zero sizes with
1349 * mknod + truncate, then opening file for read. This is a
1350 * common pattern in NFS case, it seems). Bug 6243 */
1352 /* Since there are no objects on OSTs, we have nothing to get
1353 * lock on and so we are forced to access inode->i_size
1356 /* Read beyond end of file */
1357 if (*ppos >= i_size_read(inode))
/* Clamp the zero-fill to the file size. */
1360 if (count > i_size_read(inode) - *ppos)
1361 count = i_size_read(inode) - *ppos;
1362 /* Make sure to correctly adjust the file pos pointer for
/* clear_user() returns the number of bytes NOT zeroed (EFAULT case). */
1364 notzeroed = clear_user(buf, count);
1373 if (sbi->ll_max_rw_chunk != 0) {
1374 /* first, let's know the end of the current stripe */
1376 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1379 /* correct, the end is beyond the request */
1380 if (end > *ppos + count - 1)
1381 end = *ppos + count - 1;
1383 /* and chunk shouldn't be too large even if striping is wide */
1384 if (end - *ppos > sbi->ll_max_rw_chunk)
1385 end = *ppos + sbi->ll_max_rw_chunk - 1;
/* Chunking disabled: lock the whole requested region at once. */
1387 end = *ppos + count - 1;
1390 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1392 GOTO(out, retval = PTR_ERR(node));
1395 tree.lt_fd = LUSTRE_FPRIVATE(file);
1396 rc = ll_tree_lock(&tree, node, buf, count,
1397 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1399 GOTO(out, retval = rc);
1401 ll_inode_size_lock(inode, 1);
1403 * Consistency guarantees: following possibilities exist for the
1404 * relation between region being read and real file size at this
1407 * (A): the region is completely inside of the file;
1409 * (B-x): x bytes of region are inside of the file, the rest is
1412 * (C): the region is completely outside of the file.
1414 * This classification is stable under DLM lock acquired by
1415 * ll_tree_lock() above, because to change class, other client has to
1416 * take DLM lock conflicting with our lock. Also, any updates to
1417 * ->i_size by other threads on this client are serialized by
1418 * ll_inode_size_lock(). This guarantees that short reads are handled
1419 * correctly in the face of concurrent writes and truncates.
1421 inode_init_lvb(inode, &lvb);
1422 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1424 if (*ppos + count - 1 > kms) {
1425 /* A glimpse is necessary to determine whether we return a
1426 * short read (B) or some zeroes at the end of the buffer (C) */
1427 ll_inode_size_unlock(inode, 1);
1428 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1430 ll_tree_unlock(&tree);
1434 /* region is within kms and, hence, within real file size (A).
1435 * We need to increase i_size to cover the read region so that
1436 * generic_file_read() will do its job, but that doesn't mean
1437 * the kms size is _correct_, it is only the _minimum_ size.
1438 * If someone does a stat they will get the correct size which
1439 * will always be >= the kms value here. b=11081 */
1440 if (i_size_read(inode) < kms)
1441 i_size_write(inode, kms);
1442 ll_inode_size_unlock(inode, 1);
1445 chunk = end - *ppos + 1;
1446 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1447 inode->i_ino, chunk, *ppos, i_size_read(inode));
1449 /* turn off the kernel's read-ahead */
1450 file->f_ra.ra_pages = 0;
1452 /* initialize read-ahead window once per syscall */
1455 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1456 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1457 ll_ra_read_in(file, &bead);
1461 file_accessed(file);
1462 retval = generic_file_read(file, buf, chunk, ppos);
1463 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1465 ll_tree_unlock(&tree);
/* Full chunk consumed and more requested: presumably loops back for the
 * next chunk (loop construct not visible in this listing). */
1471 if (retval == chunk && count > 0)
1477 ll_ra_read_ex(file, &bead);
/* Report accumulated bytes if any chunk succeeded, else the last error. */
1478 retval = (sum > 0) ? sum : retval;
1483 * Write to a file (through the page cache).
/*
 * write(2) entry point: serialize writers on lli_write_sem, take a PW
 * extent tree-lock (for O_APPEND: up to OBD_OBJECT_EOF so i_size is
 * authoritative), enforce maxbytes/SIGXFSZ, then push each chunk
 * through generic_file_write().  Chunking mirrors ll_file_read().
 * NOTE(review): lines are missing from this listing (chunk-loop
 * back-edge, lock_start assignments, RETURNs) — control flow is
 * partially inferred.
 */
1485 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1488 struct inode *inode = file->f_dentry->d_inode;
1489 struct ll_sb_info *sbi = ll_i2sbi(inode);
1490 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1491 struct ll_lock_tree tree;
1492 struct ll_lock_tree_node *node;
1493 loff_t maxbytes = ll_file_maxbytes(inode);
1494 loff_t lock_start, lock_end, end;
1495 ssize_t retval, chunk, sum = 0;
1499 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1500 inode->i_ino, inode->i_generation, inode, count, *ppos);
1502 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1504 /* POSIX, but surprised the VFS doesn't check this already */
1508 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1509 * called on the file, don't fail the below assertion (bug 2388). */
1510 if (file->f_flags & O_LOV_DELAY_CREATE &&
1511 ll_i2info(inode)->lli_smd == NULL)
1514 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1516 down(&ll_i2info(inode)->lli_write_sem);
1519 chunk = 0; /* just to fix gcc's warning */
1520 end = *ppos + count - 1;
1522 if (file->f_flags & O_APPEND) {
/* Append: lock to EOF so the post-enqueue i_size is trustworthy. */
1524 lock_end = OBD_OBJECT_EOF;
1525 } else if (sbi->ll_max_rw_chunk != 0) {
1526 /* first, let's know the end of the current stripe */
1528 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1531 /* correct, the end is beyond the request */
1532 if (end > *ppos + count - 1)
1533 end = *ppos + count - 1;
1535 /* and chunk shouldn't be too large even if striping is wide */
1536 if (end - *ppos > sbi->ll_max_rw_chunk)
1537 end = *ppos + sbi->ll_max_rw_chunk - 1;
1542 lock_end = *ppos + count - 1;
1544 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1547 GOTO(out, retval = PTR_ERR(node));
1549 tree.lt_fd = LUSTRE_FPRIVATE(file);
1550 rc = ll_tree_lock(&tree, node, buf, count,
1551 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1553 GOTO(out, retval = rc);
1555 /* This is ok, g_f_w will overwrite this under i_sem if it races
1556 * with a local truncate, it just makes our maxbyte checking easier.
1557 * The i_size value gets updated in ll_extent_lock() as a consequence
1558 * of the [0,EOF] extent lock we requested above. */
1559 if (file->f_flags & O_APPEND) {
1560 *ppos = i_size_read(inode);
1561 end = *ppos + count - 1;
1564 if (*ppos >= maxbytes) {
/* POSIX: writing at/after the limit raises SIGXFSZ and fails EFBIG. */
1565 send_sig(SIGXFSZ, current, 0);
1566 GOTO(out_unlock, retval = -EFBIG);
1568 if (*ppos + count > maxbytes)
1569 count = maxbytes - *ppos;
1571 /* generic_file_write handles O_APPEND after getting i_mutex */
1572 chunk = end - *ppos + 1;
1573 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1574 inode->i_ino, chunk, *ppos);
1575 retval = generic_file_write(file, buf, chunk, ppos);
1576 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1579 ll_tree_unlock(&tree);
/* Full chunk written and more remains: presumably loops for the next
 * chunk (loop construct not visible in this listing). */
1586 if (retval == chunk && count > 0)
1590 up(&ll_i2info(inode)->lli_write_sem);
1592 retval = (sum > 0) ? sum : retval;
1593 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1594 retval > 0 ? retval : 0);
1599 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) path: same locking/size dance as ll_file_read() but the
 * data is pushed to `actor`/`target` via generic_file_sendfile().
 * Objectless files bypass locking entirely.  Unlike the read path,
 * i_size is set to kms unconditionally here (no `<` guard) — matches
 * the "region within kms" case (A) only; see original read-path
 * comment b=11081 for the distinction.
 */
1601 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1602 read_actor_t actor, void *target)
1604 struct inode *inode = in_file->f_dentry->d_inode;
1605 struct ll_inode_info *lli = ll_i2info(inode);
1606 struct lov_stripe_md *lsm = lli->lli_smd;
1607 struct ll_lock_tree tree;
1608 struct ll_lock_tree_node *node;
1610 struct ll_ra_read bead;
1615 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1616 inode->i_ino, inode->i_generation, inode, count, *ppos);
1618 /* "If nbyte is 0, read() will return 0 and have no other results."
1619 * -- Single Unix Spec */
1623 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1624 /* turn off the kernel's read-ahead */
1625 in_file->f_ra.ra_pages = 0;
1627 /* File with no objects, nothing to lock */
1629 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1631 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1633 RETURN(PTR_ERR(node));
1635 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1636 rc = ll_tree_lock(&tree, node, NULL, count,
1637 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1641 ll_inode_size_lock(inode, 1);
1643 * Consistency guarantees: following possibilities exist for the
1644 * relation between region being read and real file size at this
1647 * (A): the region is completely inside of the file;
1649 * (B-x): x bytes of region are inside of the file, the rest is
1652 * (C): the region is completely outside of the file.
1654 * This classification is stable under DLM lock acquired by
1655 * ll_tree_lock() above, because to change class, other client has to
1656 * take DLM lock conflicting with our lock. Also, any updates to
1657 * ->i_size by other threads on this client are serialized by
1658 * ll_inode_size_lock(). This guarantees that short reads are handled
1659 * correctly in the face of concurrent writes and truncates.
1661 inode_init_lvb(inode, &lvb);
1662 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1664 if (*ppos + count - 1 > kms) {
1665 /* A glimpse is necessary to determine whether we return a
1666 * short read (B) or some zeroes at the end of the buffer (C) */
1667 ll_inode_size_unlock(inode, 1);
1668 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1672 /* region is within kms and, hence, within real file size (A) */
1673 i_size_write(inode, kms);
1674 ll_inode_size_unlock(inode, 1);
1677 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1678 inode->i_ino, count, *ppos, i_size_read(inode));
/* Lustre-managed read-ahead window for the duration of the transfer. */
1680 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1681 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1682 ll_ra_read_in(in_file, &bead);
1684 file_accessed(in_file);
1685 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1686 ll_ra_read_ex(in_file, &bead);
1689 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ handler (admin-only): re-create a lost OST object
 * for this file.  Copies the target id/group/ost-index from userspace,
 * clones the stripe metadata, and calls obd_create() with
 * OBD_FL_RECREATE_OBJS set.  Serialized against size changes by
 * lli_size_sem.  NOTE(review): allocation of `oa` and several error
 * checks are in lines missing from this listing.
 */
1693 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1696 struct ll_inode_info *lli = ll_i2info(inode);
1697 struct obd_export *exp = ll_i2dtexp(inode);
1698 struct ll_recreate_obj ucreatp;
1699 struct obd_trans_info oti = { 0 };
1700 struct obdo *oa = NULL;
1703 struct lov_stripe_md *lsm, *lsm2;
1706 if (!capable (CAP_SYS_ADMIN))
1709 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1710 sizeof(struct ll_recreate_obj));
1718 down(&lli->lli_size_sem);
1721 GOTO(out, rc = -ENOENT);
1722 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1723 (lsm->lsm_stripe_count));
1725 OBD_ALLOC(lsm2, lsm_size);
1727 GOTO(out, rc = -ENOMEM);
/* o_nlink is (ab)used here to carry the target OST index. */
1729 oa->o_id = ucreatp.lrc_id;
1730 oa->o_gr = ucreatp.lrc_group;
1731 oa->o_nlink = ucreatp.lrc_ost_idx;
1732 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1733 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1734 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1735 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
/* obd_create() needs a writable copy of the stripe md to work on. */
1737 memcpy(lsm2, lsm, lsm_size);
1738 rc = obd_create(exp, oa, &lsm2, &oti);
1740 OBD_FREE(lsm2, lsm_size);
1743 up(&lli->lli_size_sem);
/*
 * Apply user-supplied striping (lov_user_md) to a file by replaying an
 * IT_OPEN intent carrying the EA, then immediately closing the handle
 * via ll_release_openhandle().  Fails (path not fully visible) if the
 * file already has stripe metadata.  lli_size_sem guards the
 * check-then-create of lli_smd.
 */
1748 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1749 int flags, struct lov_user_md *lum, int lum_size)
1751 struct ll_inode_info *lli = ll_i2info(inode);
1752 struct lov_stripe_md *lsm;
1753 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1757 down(&lli->lli_size_sem);
/* Striping is create-once: bail out if it already exists. */
1760 up(&lli->lli_size_sem);
1761 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1766 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1769 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1770 GOTO(out_req_free, rc = -ENOENT);
1771 rc = oit.d.lustre.it_status;
1773 GOTO(out_req_free, rc);
/* The open was only a vehicle for the setstripe EA — close it now. */
1775 ll_release_openhandle(file->f_dentry, &oit);
1778 up(&lli->lli_size_sem);
1779 ll_intent_release(&oit);
1782 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch a file's LOV EA from the MDS by name (md_getattr_name with
 * OBD_MD_FLEASIZE|OBD_MD_FLDIREA), byte-swap it to host endianness if
 * needed, and — for LOV_MAGIC_JOIN files — flatten the join extents
 * into a lov_user_md_join the caller can consume.  On success *lmmp,
 * *lmm_size and *request are filled; the caller owns the request
 * (ptlrpc_req_finished) and, presumably, the lmj allocation for join
 * files — TODO confirm ownership against the full source.
 */
1786 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1787 struct lov_mds_md **lmmp, int *lmm_size,
1788 struct ptlrpc_request **request)
1790 struct ll_sb_info *sbi = ll_i2sbi(inode);
1791 struct mdt_body *body;
1792 struct lov_mds_md *lmm = NULL;
1793 struct ptlrpc_request *req = NULL;
1794 struct obd_capa *oc;
1797 rc = ll_get_max_mdsize(sbi, &lmmsize);
1801 oc = ll_mdscapa_get(inode);
1802 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1803 oc, filename, strlen(filename) + 1,
1804 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1805 ll_i2suppgid(inode), &req);
1808 CDEBUG(D_INFO, "md_getattr_name failed "
1809 "on %s: rc %d\n", filename, rc);
1813 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1814 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1816 lmmsize = body->eadatasize;
/* No EA present (or empty): nothing to return. */
1818 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1820 GOTO(out, rc = -ENODATA);
1823 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1824 LASSERT(lmm != NULL);
1827 * This is coming from the MDS, so is probably in
1828 * little endian. We convert it to host endian before
1829 * passing it to userspace.
/* Swabbed magic means the wire data is opposite-endian: convert. */
1831 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1832 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1833 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1834 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1835 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1838 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1839 struct lov_stripe_md *lsm;
1840 struct lov_user_md_join *lmj;
1841 int lmj_size, i, aindex = 0;
/* Unpack into an in-memory lsm so the join extent array is usable. */
1843 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1845 GOTO(out, rc = -ENOMEM);
1846 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1848 GOTO(out_free_memmd, rc);
1850 lmj_size = sizeof(struct lov_user_md_join) +
1851 lsm->lsm_stripe_count *
1852 sizeof(struct lov_user_ost_data_join);
1853 OBD_ALLOC(lmj, lmj_size);
1855 GOTO(out_free_memmd, rc = -ENOMEM);
1857 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1858 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1859 struct lov_extent *lex =
1860 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the join extent that covers stripe i (the aindex++
 * line is not visible in this listing — confirm in full source). */
1862 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1864 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1865 LPU64" len %d\n", aindex, i,
1866 lex->le_start, (int)lex->le_len);
1867 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 marks an unbounded (EOF) extent. */
1870 if ((int)lex->le_len == -1)
1871 lmj->lmm_objects[i].l_extent_end = -1;
1873 lmj->lmm_objects[i].l_extent_end =
1874 lex->le_start + lex->le_len;
1875 lmj->lmm_objects[i].l_object_id =
1876 lsm->lsm_oinfo[i]->loi_id;
1877 lmj->lmm_objects[i].l_object_gr =
1878 lsm->lsm_oinfo[i]->loi_gr;
1879 lmj->lmm_objects[i].l_ost_gen =
1880 lsm->lsm_oinfo[i]->loi_ost_gen;
1881 lmj->lmm_objects[i].l_ost_idx =
1882 lsm->lsm_oinfo[i]->loi_ost_idx;
/* From here on the caller sees the flattened join md, not the raw EA. */
1884 lmm = (struct lov_mds_md *)lmj;
1887 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1891 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler (admin-only): copy a one-stripe
 * lov_user_md (+ one lov_user_ost_data) from userspace and apply it
 * with MDS_OPEN_HAS_OBJS, i.e. the objects already exist and only the
 * EA is being (re)attached.  The temporary buffer is freed on both
 * the copy-failure and success paths shown below.
 */
1896 static int ll_lov_setea(struct inode *inode, struct file *file,
1899 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1900 struct lov_user_md *lump;
1901 int lum_size = sizeof(struct lov_user_md) +
1902 sizeof(struct lov_user_ost_data);
1906 if (!capable (CAP_SYS_ADMIN))
1909 OBD_ALLOC(lump, lum_size);
1913 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1915 OBD_FREE(lump, lum_size);
1919 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1921 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the lov_user_md header from
 * userspace (objects array deliberately excluded — bug 1152), create
 * the striping, then echo the resulting layout back to the user buffer
 * via the LL_IOC_LOV_GETSTRIPE obd_iocontrol path.  The
 * put_user(0, &lump->lmm_stripe_count) primes the user buffer so
 * GETSTRIPE reports the server-chosen count.
 */
1925 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1928 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1930 int flags = FMODE_WRITE;
1933 /* Bug 1152: copy properly when this is no longer true */
1934 LASSERT(sizeof(lum) == sizeof(*lump));
1935 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1936 rc = copy_from_user(&lum, lump, sizeof(lum));
1940 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1942 put_user(0, &lump->lmm_stripe_count);
1943 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1944 0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE handler: delegate straight to obd_iocontrol()
 * with the file's stripe md (the no-striping early return is in a line
 * not visible in this listing).
 */
1949 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1951 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1956 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK: take a whole-file [0, EOF] LCK_GROUP extent lock
 * with the user-supplied group id, remember the handle in the per-open
 * file data, and mark the fd so normal extent locking is bypassed
 * (LL_FILE_IGNORE_LOCK) while the group lock is held.
 */
1960 static int ll_get_grouplock(struct inode *inode, struct file *file,
1963 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1964 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1965 .end = OBD_OBJECT_EOF}};
1966 struct lustre_handle lockh = { 0 };
1967 struct ll_inode_info *lli = ll_i2info(inode);
1968 struct lov_stripe_md *lsm = lli->lli_smd;
/* Only one group lock per open file descriptor. */
1972 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1976 policy.l_extent.gid = arg;
1977 if (file->f_flags & O_NONBLOCK)
1978 flags = LDLM_FL_BLOCK_NOWAIT;
1980 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1984 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1986 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK: validate that a group lock is held and that the
 * caller's gid matches the one it was taken with, then drop the saved
 * handle via ll_extent_unlock() and clear the per-fd bookkeeping.
 */
1991 static int ll_put_grouplock(struct inode *inode, struct file *file,
1994 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1995 struct ll_inode_info *lli = ll_i2info(inode);
1996 struct lov_stripe_md *lsm = lli->lli_smd;
2000 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2001 /* Ugh, it's already unlocked. */
2005 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
/* Clear flags before cancelling so concurrent lockers see the fd as
 * unlocked (ordering rationale not visible — confirm in full source). */
2008 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2010 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2015 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Pre-flight checks for joining `tail` onto `head`: the server must
 * advertise LL_SBI_JOIN, both inodes must be regular files, must be
 * distinct, and head's size must be a multiple of JOIN_FILE_ALIGN
 * (64K).  Error-return lines are not visible in this listing.
 */
2020 static int join_sanity_check(struct inode *head, struct inode *tail)
2023 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2024 CERROR("server do not support join \n");
2027 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2028 CERROR("tail ino %lu and ino head %lu must be regular\n",
2029 head->i_ino, tail->i_ino);
2032 if (head->i_ino == tail->i_ino) {
2033 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2036 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2037 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Issue the actual join request to the MDS: an IT_OPEN intent with
 * O_JOIN_FILE on the tail's name, carrying head's current size as
 * opaque intent data, enqueued under an LDLM_IBITS/LCK_CW lock.  Any
 * lock granted as a side effect is released immediately, and the open
 * handle the intent produced is closed via ll_release_openhandle().
 */
2043 static int join_file(struct inode *head_inode, struct file *head_filp,
2044 struct file *tail_filp)
2046 struct dentry *tail_dentry = tail_filp->f_dentry;
2047 struct lookup_intent oit = {.it_op = IT_OPEN,
2048 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2049 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2050 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2052 struct lustre_handle lockh;
2053 struct md_op_data *op_data;
2058 tail_dentry = tail_filp->f_dentry;
/* The head's size rides along as the intent's opaque data. */
2060 data = i_size_read(head_inode);
2061 op_data = ll_prep_md_op_data(NULL, head_inode,
2062 tail_dentry->d_parent->d_inode,
2063 tail_dentry->d_name.name,
2064 tail_dentry->d_name.len, 0,
2065 LUSTRE_OPC_ANY, &data);
2066 if (IS_ERR(op_data))
2067 RETURN(PTR_ERR(op_data));
2069 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2070 op_data, &lockh, NULL, 0, 0);
2072 ll_finish_md_op_data(op_data);
2076 rc = oit.d.lustre.it_status;
2078 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2079 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2080 ptlrpc_req_finished((struct ptlrpc_request *)
2081 oit.d.lustre.it_data);
2085 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2087 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2088 oit.d.lustre.it_lock_mode = 0;
2090 ll_release_openhandle(head_filp->f_dentry, &oit);
2092 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN: append the file named `filename_tail` onto `head`.
 * Opens the tail, orders the two inodes by i_ino to establish a fixed
 * lock order, takes whole-file LCK_EX tree-locks on both, runs
 * join_sanity_check(), then performs the join via join_file().
 * Unwinding is staged through `cleanup_phase` (phase numbers for the
 * case labels are in lines missing from this listing); on success the
 * head's now-stale stripe md is freed and lli_smd reset so it is
 * refetched.
 */
2096 static int ll_file_join(struct inode *head, struct file *filp,
2097 char *filename_tail)
2099 struct inode *tail = NULL, *first = NULL, *second = NULL;
2100 struct dentry *tail_dentry;
2101 struct file *tail_filp, *first_filp, *second_filp;
2102 struct ll_lock_tree first_tree, second_tree;
2103 struct ll_lock_tree_node *first_node, *second_node;
2104 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2105 int rc = 0, cleanup_phase = 0;
2108 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2109 head->i_ino, head->i_generation, head, filename_tail);
2111 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2112 if (IS_ERR(tail_filp)) {
2113 CERROR("Can not open tail file %s", filename_tail);
2114 rc = PTR_ERR(tail_filp);
2117 tail = igrab(tail_filp->f_dentry->d_inode);
2119 tlli = ll_i2info(tail);
2120 tail_dentry = tail_filp->f_dentry;
2121 LASSERT(tail_dentry);
2124 /* Take the EX locks in a deterministic order (higher i_ino first)
 * so two concurrent joins cannot deadlock against each other. */
2125 first = head->i_ino > tail->i_ino ? head : tail;
2126 second = head->i_ino > tail->i_ino ? tail : head;
2127 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2128 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2130 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2131 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2132 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2133 if (IS_ERR(first_node)){
2134 rc = PTR_ERR(first_node);
2137 first_tree.lt_fd = first_filp->private_data;
2138 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2143 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2144 if (IS_ERR(second_node)){
2145 rc = PTR_ERR(second_node);
2148 second_tree.lt_fd = second_filp->private_data;
2149 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2154 rc = join_sanity_check(head, tail);
2158 rc = join_file(head, filp, tail_filp);
/* Staged unwind: fall-through releases whatever was acquired.  The
 * numeric case labels are in lines not visible here. */
2162 switch (cleanup_phase) {
2164 ll_tree_unlock(&second_tree);
2165 obd_cancel_unused(ll_i2dtexp(second),
2166 ll_i2info(second)->lli_smd, 0, NULL);
2168 ll_tree_unlock(&first_tree);
2169 obd_cancel_unused(ll_i2dtexp(first),
2170 ll_i2info(first)->lli_smd, 0, NULL);
2172 filp_close(tail_filp, 0);
2175 if (head && rc == 0) {
/* Join changed the layout on the MDS: drop the cached stripe md so
 * the next access refetches it. */
2176 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2178 hlli->lli_smd = NULL;
2183 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle produced by a successful IT_OPEN intent
 * that the caller does not want to keep (e.g. setstripe/join opens).
 * No-ops for the filesystem root and for intents that carry no
 * DISP_OPEN_OPEN disposition.  Also drops the intent's request
 * reference that ll_file_open would otherwise have consumed.
 */
2189 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2191 struct inode *inode = dentry->d_inode;
2192 struct obd_client_handle *och;
2198 /* Root ? Do nothing. */
2199 if (dentry->d_inode->i_sb->s_root == dentry)
2202 /* No open handle to close? Move away */
2203 if (!it_disposition(it, DISP_OPEN_OPEN))
2206 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2208 OBD_ALLOC(och, sizeof(*och));
2210 GOTO(out, rc = -ENOMEM);
2212 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2213 ll_i2info(inode), it, och);
2215 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2218 /* this one is in place of ll_file_open */
2219 ptlrpc_req_finished(it->d.lustre.it_data);
2220 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl(2) dispatch for regular files: per-fd flag get/set, LOV
 * stripe management, object recreation, ext3-compatible flag/version
 * ioctls, file join, group locks, statfs, and security-context flush.
 * tty ioctls and unknown commands fall through — tty to the default
 * handler (return line not visible), unknown first to registered
 * ll_iocontrol_call() handlers and finally to obd_iocontrol() on the
 * data export.
 */
2224 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2227 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2231 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2232 inode->i_generation, inode, cmd);
2233 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2235 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2236 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2240 case LL_IOC_GETFLAGS:
2241 /* Get the current value of the file flags */
2242 return put_user(fd->fd_flags, (int *)arg);
2243 case LL_IOC_SETFLAGS:
2244 case LL_IOC_CLRFLAGS:
2245 /* Set or clear specific file flags */
2246 /* XXX This probably needs checks to ensure the flags are
2247 * not abused, and to handle any flag side effects.
2249 if (get_user(flags, (int *) arg))
2252 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK without O_DIRECT would corrupt cached pages. */
2253 if ((flags & LL_FILE_IGNORE_LOCK) &&
2254 !(file->f_flags & O_DIRECT)) {
2255 CERROR("%s: unable to disable locking on "
2256 "non-O_DIRECT file\n", current->comm);
2260 fd->fd_flags |= flags;
2262 fd->fd_flags &= ~flags;
2265 case LL_IOC_LOV_SETSTRIPE:
2266 RETURN(ll_lov_setstripe(inode, file, arg));
2267 case LL_IOC_LOV_SETEA:
2268 RETURN(ll_lov_setea(inode, file, arg));
2269 case LL_IOC_LOV_GETSTRIPE:
2270 RETURN(ll_lov_getstripe(inode, arg));
2271 case LL_IOC_RECREATE_OBJ:
2272 RETURN(ll_lov_recreate_obj(inode, file, arg));
2273 case EXT3_IOC_GETFLAGS:
2274 case EXT3_IOC_SETFLAGS:
2275 RETURN(ll_iocontrol(inode, file, cmd, arg));
2276 case EXT3_IOC_GETVERSION_OLD:
2277 case EXT3_IOC_GETVERSION:
2278 RETURN(put_user(inode->i_generation, (int *)arg));
/* LL_IOC_JOIN (case label not visible): join a named tail file. */
2283 ftail = getname((const char *)arg);
2285 RETURN(PTR_ERR(ftail));
2286 rc = ll_file_join(inode, file, ftail);
2290 case LL_IOC_GROUP_LOCK:
2291 RETURN(ll_get_grouplock(inode, file, arg));
2292 case LL_IOC_GROUP_UNLOCK:
2293 RETURN(ll_put_grouplock(inode, file, arg));
2294 case IOC_OBD_STATFS:
2295 RETURN(ll_obd_statfs(inode, (void *)arg));
2297 /* We need to special case any other ioctls we want to handle,
2298 * to send them to the MDS/OST as appropriate and to properly
2299 * network encode the arg field.
2300 case EXT3_IOC_SETVERSION_OLD:
2301 case EXT3_IOC_SETVERSION:
2303 case LL_IOC_FLUSHCTX:
2304 RETURN(ll_flush_ctx(inode));
/* Unknown cmd: try dynamically-registered handlers first... */
2309 ll_iocontrol_call(inode, file, cmd, arg, &err))
/* ...then hand it to the data-target obd as a last resort. */
2312 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek: SEEK_SET/SEEK_CUR are local; SEEK_END first refreshes i_size
 * with a glimpse (non-blocking if O_NONBLOCK) and reads it under
 * ll_inode_size_lock().  The final offset is validated against
 * ll_file_maxbytes() before f_pos is updated.  NOTE(review): `retval`
 * is computed twice (the CDEBUG pre-computation vs the later `offset +=`
 * path) because intervening lines are missing from this listing.
 */
2318 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2320 struct inode *inode = file->f_dentry->d_inode;
2321 struct ll_inode_info *lli = ll_i2info(inode);
2322 struct lov_stripe_md *lsm = lli->lli_smd;
2325 retval = offset + ((origin == 2) ? i_size_read(inode) :
2326 (origin == 1) ? file->f_pos : 0);
2327 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2328 inode->i_ino, inode->i_generation, inode, retval, retval,
2329 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2330 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2332 if (origin == 2) { /* SEEK_END */
2333 int nonblock = 0, rc;
2335 if (file->f_flags & O_NONBLOCK)
2336 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before trusting it for SEEK_END. */
2339 rc = ll_glimpse_size(inode, nonblock);
2344 ll_inode_size_lock(inode, 0);
2345 offset += i_size_read(inode);
2346 ll_inode_size_unlock(inode, 0);
2347 } else if (origin == 1) { /* SEEK_CUR */
2348 offset += file->f_pos;
2352 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2353 if (offset != file->f_pos) {
2354 file->f_pos = offset;
2355 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2357 file->f_version = ++event;
/*
 * fsync(2): wait for in-flight page writeback, surface any async write
 * errors recorded on the inode/stripes, sync the metadata via md_sync()
 * to the MDS, and — for striped files — sync [0, EOF] of the data via
 * obd_sync() to the OSTs with an OSS write capability.  NOTE(review):
 * several guard/cleanup lines (obdo allocation check, capa puts,
 * RETURN) are missing from this listing.
 */
2366 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2368 struct inode *inode = dentry->d_inode;
2369 struct ll_inode_info *lli = ll_i2info(inode);
2370 struct lov_stripe_md *lsm = lli->lli_smd;
2371 struct ptlrpc_request *req;
2372 struct obd_capa *oc;
2375 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2376 inode->i_generation, inode);
2377 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2379 /* fsync's caller has already called _fdata{sync,write}, we want
2380 * that IO to finish before calling the osc and mdc sync methods */
2381 rc = filemap_fdatawait(inode->i_mapping);
2383 /* catch async errors that were recorded back when async writeback
2384 * failed for pages in this mapping. */
2385 err = lli->lli_async_rc;
/* Consume the recorded error so it is reported exactly once. */
2386 lli->lli_async_rc = 0;
2390 err = lov_test_and_clear_async_rc(lsm);
2395 oc = ll_mdscapa_get(inode);
2396 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2402 ptlrpc_req_finished(req);
/* obdo allocation failure path (the alloc itself is not visible). */
2409 RETURN(rc ? rc : -ENOMEM);
2411 oa->o_id = lsm->lsm_object_id;
2412 oa->o_gr = lsm->lsm_object_gr;
2413 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2414 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2415 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2418 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2419 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2420 0, OBD_OBJECT_EOF, oc);
/*
 * POSIX fcntl/flock handler: translate the kernel file_lock into an
 * LDLM_FLOCK enqueue against the MDS.  fl_type maps to the lock mode
 * (F_RDLCK→PR, F_WRLCK→PW, F_UNLCK→NL as an unlock carrier — see the
 * long comment below); the cmd maps to enqueue flags (F_GETLK* →
 * LDLM_FL_TEST_LOCK, non-blocking set → LDLM_FL_BLOCK_NOWAIT; the
 * case labels themselves are in lines not visible here).  On success
 * the local lock lists are updated via ll_flock_lock_file_wait() /
 * posix_lock_file_wait().
 */
2430 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2432 struct inode *inode = file->f_dentry->d_inode;
2433 struct ll_sb_info *sbi = ll_i2sbi(inode);
2434 struct ldlm_res_id res_id =
2435 { .name = { fid_seq(ll_inode2fid(inode)),
2436 fid_oid(ll_inode2fid(inode)),
2437 fid_ver(ll_inode2fid(inode)),
2439 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2440 ldlm_flock_completion_ast, NULL, file_lock };
2441 struct lustre_handle lockh = {0};
2442 ldlm_policy_data_t flock;
2447 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2448 inode->i_ino, file_lock);
2450 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2452 if (file_lock->fl_flags & FL_FLOCK) {
2453 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2454 /* set missing params for flock() calls */
2455 file_lock->fl_end = OFFSET_MAX;
2456 file_lock->fl_pid = current->tgid;
2458 flock.l_flock.pid = file_lock->fl_pid;
2459 flock.l_flock.start = file_lock->fl_start;
2460 flock.l_flock.end = file_lock->fl_end;
2462 switch (file_lock->fl_type) {
/* F_RDLCK (case label not visible): shared read lock. */
2464 einfo.ei_mode = LCK_PR;
2467 /* An unlock request may or may not have any relation to
2468 * existing locks so we may not be able to pass a lock handle
2469 * via a normal ldlm_lock_cancel() request. The request may even
2470 * unlock a byte range in the middle of an existing lock. In
2471 * order to process an unlock request we need all of the same
2472 * information that is given with a normal read or write record
2473 * lock request. To avoid creating another ldlm unlock (cancel)
2474 * message we'll treat a LCK_NL flock request as an unlock. */
2475 einfo.ei_mode = LCK_NL;
/* F_WRLCK (case label not visible): exclusive write lock. */
2478 einfo.ei_mode = LCK_PW;
2481 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set request (case label not visible). */
2496 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK* (case label not visible): probe only, grant nothing. */
2502 flags = LDLM_FL_TEST_LOCK;
2503 /* Save the old mode so that if the mode in the lock changes we
2504 * can decrement the appropriate reader or writer refcount. */
2505 file_lock->fl_type = einfo.ei_mode;
2508 CERROR("unknown fcntl lock command: %d\n", cmd);
2512 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2513 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2514 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2516 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2517 &flock, &flags, NULL, 0, NULL, &lockh, 0);
2518 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2519 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2520 #ifdef HAVE_F_OP_FLOCK
2521 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2522 !(flags & LDLM_FL_TEST_LOCK))
2523 posix_lock_file_wait(file, file_lock);
/* Lock entry point used by -o noflock mounts: per the comment on
 * ll_file_operations_noflock below, lock calls are rejected with ENOSYS
 * (body not visible here). */
2529 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Check whether this client already holds a granted MDS inodebits lock
 * covering @bits on @inode, in any of CR/CW/PR/PW modes.
 * LDLM_FL_TEST_LOCK makes the match side-effect free (no reference is
 * taken on the matched lock). */
2536 int ll_have_md_lock(struct inode *inode, __u64 bits)
2538 struct lustre_handle lockh;
2539 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2547 fid = &ll_i2info(inode)->lli_fid;
2548 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2550 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2551 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2552 LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/* Like ll_have_md_lock(), but without LDLM_FL_TEST_LOCK: on a match a
 * reference is taken and the handle is returned in *lockh so the caller
 * can use (and must later release) the lock.  Returns the matched
 * ldlm_mode_t, based on the md_lock_match() result. */
2558 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2559 struct lustre_handle *lockh)
2561 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2567 fid = &ll_i2info(inode)->lli_fid;
2568 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2570 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2571 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2572 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular, non-directory inode means the object was already
 * unlinked and is treated as success; other failures are logged.
 * NOTE(review): some lines of this helper are not visible here. */
2576 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2577 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2578 * and return success */
2580 /* This path cannot be hit for regular files unless in
2581 * case of obscure races, so no need to to validate
2583 if (!S_ISREG(inode->i_mode) &&
2584 !S_ISDIR(inode->i_mode))
2589 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Refresh the inode's metadata from the MDS before a getattr-style
 * operation.  Two paths:
 *   - if the server supports OBD_CONNECT_ATTRFID, do a getattr-by-FID
 *     intent lock (no name needed) and finish via
 *     ll_revalidate_it_finish();
 *   - otherwise, only if no UPDATE|LOOKUP MDS inodebits lock is already
 *     held, issue a plain md_getattr() and rebuild the inode with
 *     ll_prep_inode().
 * Finally glimpse the file size from the OSTs when a stripe MD exists. */
2597 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2599 struct inode *inode = dentry->d_inode;
2600 struct ptlrpc_request *req = NULL;
2601 struct ll_sb_info *sbi;
2602 struct obd_export *exp;
/* Guard for an unexpected state (condition not visible in this view). */
2607 CERROR("REPORT THIS LINE TO PETER\n");
2610 sbi = ll_i2sbi(inode);
2612 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2613 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2615 exp = ll_i2mdexp(inode);
2617 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2618 struct lookup_intent oit = { .it_op = IT_GETATTR };
2619 struct md_op_data *op_data;
2621 /* Call getattr by fid, so do not provide name at all. */
2622 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2623 dentry->d_inode, NULL, 0, 0,
2624 LUSTRE_OPC_ANY, NULL);
2625 if (IS_ERR(op_data))
2626 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to verify the FID is still valid. */
2628 oit.it_flags |= O_CHECK_STALE;
2629 rc = md_intent_lock(exp, op_data, NULL, 0,
2630 /* we are not interested in name
2633 ll_md_blocking_ast, 0);
2634 ll_finish_md_op_data(op_data);
2635 oit.it_flags &= ~O_CHECK_STALE;
2637 rc = ll_inode_revalidate_fini(inode, rc);
2641 rc = ll_revalidate_it_finish(req, &oit, dentry);
2643 ll_intent_release(&oit);
2647 /* Unlinked? Unhash dentry, so it is not picked up later by
2648 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2649 here to preserve get_cwd functionality on 2.6.
2651 if (!dentry->d_inode->i_nlink) {
2652 spin_lock(&dcache_lock);
2653 ll_drop_dentry(dentry);
2654 spin_unlock(&dcache_lock);
2657 ll_lookup_finish_locks(&oit, dentry);
2658 } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2659 MDS_INODELOCK_LOOKUP)) {
2660 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2661 obd_valid valid = OBD_MD_FLGETATTR;
2662 struct obd_capa *oc;
/* Regular files may carry striping EAs; size the reply buffer for
 * the largest EA the MDS can return. */
2665 if (S_ISREG(inode->i_mode)) {
2666 rc = ll_get_max_mdsize(sbi, &ealen);
2669 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2671 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2672 * capa for this inode. Because we only keep capas of dirs
2674 oc = ll_mdscapa_get(inode);
2675 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2679 rc = ll_inode_revalidate_fini(inode, rc);
2683 rc = ll_prep_inode(&inode, req, NULL);
2688 /* if object not yet allocated, don't validate size */
2689 if (ll_i2info(inode)->lli_smd == NULL)
2692 /* ll_glimpse_size will prefer locally cached writes if they extend
2694 rc = ll_glimpse_size(inode, 0);
2697 ptlrpc_req_finished(req);
/* getattr with an explicit lookup intent: revalidate the inode from the
 * MDS, then copy its attributes into *stat for the VFS.  Size and block
 * count are read under the ll inode size lock for consistency with
 * concurrent size updates. */
2701 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2702 struct lookup_intent *it, struct kstat *stat)
2704 struct inode *inode = de->d_inode;
2707 res = ll_inode_revalidate_it(de, it);
2708 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2713 stat->dev = inode->i_sb->s_dev;
2714 stat->ino = inode->i_ino;
2715 stat->mode = inode->i_mode;
2716 stat->nlink = inode->i_nlink;
2717 stat->uid = inode->i_uid;
2718 stat->gid = inode->i_gid;
2719 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2720 stat->atime = inode->i_atime;
2721 stat->mtime = inode->i_mtime;
2722 stat->ctime = inode->i_ctime;
2723 #ifdef HAVE_INODE_BLKSIZE
2724 stat->blksize = inode->i_blksize;
/* Older/newer kernels disagree on where blocksize lives; derive it
 * from i_blkbits when i_blksize is unavailable. */
2726 stat->blksize = 1 << inode->i_blkbits;
2729 ll_inode_size_lock(inode, 0);
2730 stat->size = i_size_read(inode);
2731 stat->blocks = inode->i_blocks;
2732 ll_inode_size_unlock(inode, 0);
/* Plain VFS ->getattr entry point: wraps ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2736 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2738 struct lookup_intent it = { .it_op = IT_GETATTR };
2740 return ll_getattr_it(mnt, de, &it, stat);
/* POSIX ACL permission check used as the acl callback for
 * generic_permission(): duplicate the cached ACL under lli_lock (so the
 * cache can be replaced concurrently), evaluate it against @mask, then
 * drop the reference.  Compiled out when CONFIG_FS_POSIX_ACL is unset. */
2744 int lustre_check_acl(struct inode *inode, int mask)
2746 #ifdef CONFIG_FS_POSIX_ACL
2747 struct ll_inode_info *lli = ll_i2info(inode);
2748 struct posix_acl *acl;
2752 spin_lock(&lli->lli_lock);
2753 acl = posix_acl_dup(lli->lli_posix_acl);
2754 spin_unlock(&lli->lli_lock);
2759 rc = posix_acl_permission(inode, acl, mask);
2760 posix_acl_release(acl);
/* VFS ->permission.  Two variants selected by kernel version:
 *   - >= 2.6.10: delegate to generic_permission() with lustre_check_acl
 *     as the ACL callback;
 *   - older kernels: open-coded equivalent of generic_permission()
 *     (owner/group/other bits, ACL fallback, capability overrides).
 * Both variants route remote-client mounts (LL_SBI_RMT_CLIENT) to
 * lustre_check_remote_perm() instead. */
2768 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2769 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2771 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2772 inode->i_ino, inode->i_generation, inode, mask);
2773 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2774 return lustre_check_remote_perm(inode, mask);
2776 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2777 return generic_permission(inode, mask, lustre_check_acl);
2780 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2782 int mode = inode->i_mode;
2785 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2786 inode->i_ino, inode->i_generation, inode, mask);
2788 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2789 return lustre_check_remote_perm(inode, mask);
2791 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes are refused on read-only or immutable inodes. */
2793 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2794 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2796 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner check first; otherwise fall through to group bits / ACL. */
2798 if (current->fsuid == inode->i_uid) {
2801 if (((mode >> 3) & mask & S_IRWXO) != mask)
2803 rc = lustre_check_acl(inode, mask);
2807 goto check_capabilities;
2811 if (in_group_p(inode->i_gid))
2814 if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides mirror the kernel's generic_permission():
 * DAC_OVERRIDE for everything except exec of non-executables,
 * DAC_READ_SEARCH for reads and directory lookups. */
2818 if (!(mask & MAY_EXEC) ||
2819 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)
2820 if (capable(CAP_DAC_OVERRIDE))
2823 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2824 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* -o localflock - only provides locally consistent flock locks */
2832 struct file_operations ll_file_operations = {
2833 .read = ll_file_read,
2834 .write = ll_file_write,
2835 .ioctl = ll_file_ioctl,
2836 .open = ll_file_open,
2837 .release = ll_file_release,
2838 .mmap = ll_file_mmap,
2839 .llseek = ll_file_seek,
2840 .sendfile = ll_file_sendfile,
/* Default (-o flock) file ops: same as ll_file_operations but with
 * cluster-wide lock support wired to ll_file_flock().  The .flock
 * member only exists on kernels with HAVE_F_OP_FLOCK. */
2844 struct file_operations ll_file_operations_flock = {
2845 .read = ll_file_read,
2846 .write = ll_file_write,
2847 .ioctl = ll_file_ioctl,
2848 .open = ll_file_open,
2849 .release = ll_file_release,
2850 .mmap = ll_file_mmap,
2851 .llseek = ll_file_seek,
2852 .sendfile = ll_file_sendfile,
2854 #ifdef HAVE_F_OP_FLOCK
2855 .flock = ll_file_flock,
2857 .lock = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
2861 struct file_operations ll_file_operations_noflock = {
2862 .read = ll_file_read,
2863 .write = ll_file_write,
2864 .ioctl = ll_file_ioctl,
2865 .open = ll_file_open,
2866 .release = ll_file_release,
2867 .mmap = ll_file_mmap,
2868 .llseek = ll_file_seek,
2869 .sendfile = ll_file_sendfile,
2871 #ifdef HAVE_F_OP_FLOCK
2872 .flock = ll_file_noflock,
2874 .lock = ll_file_noflock
/* Inode operations for regular files; setattr hook depends on whether
 * the kernel carries the VFS intent patches. */
2877 struct inode_operations ll_file_inode_operations = {
2878 #ifdef HAVE_VFS_INTENT_PATCHES
2879 .setattr_raw = ll_setattr_raw,
2881 .setattr = ll_setattr,
2882 .truncate = ll_truncate,
2883 .getattr = ll_getattr,
2884 .permission = ll_inode_permission,
2885 .setxattr = ll_setxattr,
2886 .getxattr = ll_getxattr,
2887 .listxattr = ll_listxattr,
2888 .removexattr = ll_removexattr,
/* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, protected
 * by a read-write semaphore (writers: register/unregister; readers:
 * dispatch in ll_iocontrol_call). */
2892 static struct llioc_ctl_data {
2893 struct rw_semaphore ioc_sem;
2894 struct list_head ioc_head;
2896 __RWSEM_INITIALIZER(llioc.ioc_sem),
2897 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: a callback plus the list of ioctl command
 * numbers it serves, stored inline as a trailing array. */
2902 struct list_head iocd_list;
2903 unsigned int iocd_size;
2904 llioc_callback_t iocd_cb;
2905 unsigned int iocd_count;
2906 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler: @cb will be offered the @count
 * ioctl command numbers listed in @cmd.  The llioc_data allocation is
 * linked into the global registry under the write lock.
 * NOTE(review): the return statement is not visible here; presumably
 * the allocation itself is returned as the opaque "magic" cookie that
 * ll_iocontrol_unregister() takes — confirm against the full source. */
2909 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2912 struct llioc_data *in_data = NULL;
/* Reject bad callbacks/command lists and out-of-range counts. */
2915 if (cb == NULL || cmd == NULL ||
2916 count > LLIOC_MAX_CMD || count < 0)
/* Size includes the trailing iocd_cmd[] flexible array. */
2919 size = sizeof(*in_data) + count * sizeof(unsigned int);
2920 OBD_ALLOC(in_data, size);
2921 if (in_data == NULL)
2924 memset(in_data, 0, sizeof(*in_data));
2925 in_data->iocd_size = size;
2926 in_data->iocd_cb = cb;
2927 in_data->iocd_count = count;
2928 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2930 down_write(&llioc.ioc_sem);
2931 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2932 up_write(&llioc.ioc_sem);
/* Remove a previously registered dynamic ioctl handler identified by
 * @magic (the cookie from ll_iocontrol_register), unlinking it from the
 * registry under the write lock and freeing it.  Warns if the cookie is
 * not found. */
2937 void ll_iocontrol_unregister(void *magic)
2939 struct llioc_data *tmp;
2944 down_write(&llioc.ioc_sem);
2945 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Capture the size before freeing; OBD_FREE needs it. */
2947 unsigned int size = tmp->iocd_size;
2949 list_del(&tmp->iocd_list);
/* Drop the semaphore before freeing — the entry is already unlinked. */
2950 up_write(&llioc.ioc_sem);
2952 OBD_FREE(tmp, size);
2956 up_write(&llioc.ioc_sem);
2958 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Exported so external modules can plug their own ioctl handlers into
 * the llite dispatch path. */
2961 EXPORT_SYMBOL(ll_iocontrol_register);
2962 EXPORT_SYMBOL(ll_iocontrol_unregister);
2964 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2965 unsigned int cmd, unsigned long arg, int *rcp)
2967 enum llioc_iter ret = LLIOC_CONT;
2968 struct llioc_data *data;
2969 int rc = -EINVAL, i;
2971 down_read(&llioc.ioc_sem);
2972 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2973 for (i = 0; i < data->iocd_count; i++) {
2974 if (cmd != data->iocd_cmd[i])
2977 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2981 if (ret == LLIOC_STOP)
2984 up_read(&llioc.ioc_sem);