1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
61 op_data->op_capa1 = ll_mdscapa_get(inode);
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
72 if (!(och->och_flags & FMODE_WRITE))
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr to back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
289 /* don't do anything for / */
290 if (inode->i_sb->s_root == file->f_dentry)
293 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
294 fd = LUSTRE_FPRIVATE(file);
297 /* don't do anything for / */
298 if (inode->i_sb->s_root == file->f_dentry) {
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 lov_test_and_clear_async_rc(lsm);
306 lli->lli_async_rc = 0;
308 rc = ll_md_close(sbi->ll_md_exp, inode, file);
312 static int ll_intent_file_open(struct file *file, void *lmm,
313 int lmmsize, struct lookup_intent *itp)
315 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
316 struct dentry *parent = file->f_dentry->d_parent;
317 const char *name = file->f_dentry->d_name.name;
318 const int len = file->f_dentry->d_name.len;
319 struct md_op_data *op_data;
320 struct ptlrpc_request *req;
326 /* Usually we come here only for NFSD, and we want open lock.
327 But we can also get here with pre 2.6.15 patchless kernels, and in
328 that case that lock is also ok */
329 /* We can also get here if there was cached open handle in revalidate_it
330 * but it disappeared while we were getting from there to ll_file_open.
331 * But this means this file was closed and immediatelly opened which
332 * makes a good candidate for using OPEN lock */
333 /* If lmmsize & lmm are not 0, we are just setting stripe info
334 * parameters. No need for the open lock */
335 if (!lmm && !lmmsize)
336 itp->it_flags |= MDS_OPEN_LOCK;
338 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
339 file->f_dentry->d_inode, name, len,
340 O_RDWR, LUSTRE_OPC_ANY, NULL);
342 RETURN(PTR_ERR(op_data));
344 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
345 0 /*unused */, &req, ll_md_blocking_ast, 0);
346 ll_finish_md_op_data(op_data);
348 /* reason for keep own exit path - don`t flood log
349 * with messages with -ESTALE errors.
351 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
352 it_open_error(DISP_OPEN_OPEN, itp))
354 ll_release_openhandle(file->f_dentry, itp);
358 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
359 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
360 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
364 if (itp->d.lustre.it_lock_mode)
365 md_set_lock_data(sbi->ll_md_exp,
366 &itp->d.lustre.it_lock_handle,
367 file->f_dentry->d_inode);
369 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
372 ptlrpc_req_finished(itp->d.lustre.it_data);
375 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
376 ll_intent_drop_lock(itp);
381 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
382 struct lookup_intent *it, struct obd_client_handle *och)
384 struct ptlrpc_request *req = it->d.lustre.it_data;
385 struct mdt_body *body;
389 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
390 LASSERT(body != NULL); /* reply already checked out */
391 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
393 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
394 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
395 och->och_fid = lli->lli_fid;
396 och->och_flags = it->it_flags;
397 lli->lli_ioepoch = body->ioepoch;
399 return md_set_open_replay_data(md_exp, och, req);
402 int ll_local_open(struct file *file, struct lookup_intent *it,
403 struct ll_file_data *fd, struct obd_client_handle *och)
405 struct inode *inode = file->f_dentry->d_inode;
406 struct ll_inode_info *lli = ll_i2info(inode);
409 LASSERT(!LUSTRE_FPRIVATE(file));
414 struct ptlrpc_request *req = it->d.lustre.it_data;
415 struct mdt_body *body;
418 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
422 body = lustre_msg_buf(req->rq_repmsg,
423 DLM_REPLY_REC_OFF, sizeof(*body));
425 if ((it->it_flags & FMODE_WRITE) &&
426 (body->valid & OBD_MD_FLSIZE))
428 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
429 lli->lli_ioepoch, PFID(&lli->lli_fid));
433 LUSTRE_FPRIVATE(file) = fd;
434 ll_readahead_init(inode, &fd->fd_ras);
435 fd->fd_omode = it->it_flags;
439 /* Open a file, and (for the very first open) create objects on the OSTs at
440 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
441 * creation or open until ll_lov_setstripe() ioctl is called. We grab
442 * lli_open_sem to ensure no other process will create objects, send the
443 * stripe MD to the MDS, or try to destroy the objects if that fails.
445 * If we already have the stripe MD locally then we don't request it in
446 * md_open(), by passing a lmm_size = 0.
448 * It is up to the application to ensure no other processes open this file
449 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
450 * used. We might be able to avoid races of that sort by getting lli_open_sem
451 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
452 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
454 int ll_file_open(struct inode *inode, struct file *file)
456 struct ll_inode_info *lli = ll_i2info(inode);
457 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
458 .it_flags = file->f_flags };
459 struct lov_stripe_md *lsm;
460 struct ptlrpc_request *req = NULL;
461 struct obd_client_handle **och_p;
463 struct ll_file_data *fd;
467 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
468 inode->i_generation, inode, file->f_flags);
470 /* don't do anything for / */
471 if (inode->i_sb->s_root == file->f_dentry)
474 #ifdef LUSTRE_KERNEL_VERSION
477 it = file->private_data; /* XXX: compat macro */
478 file->private_data = NULL; /* prevent ll_local_open assertion */
481 fd = ll_file_data_get();
485 /* don't do anything for / */
486 if (inode->i_sb->s_root == file->f_dentry) {
487 LUSTRE_FPRIVATE(file) = fd;
491 if (!it || !it->d.lustre.it_disposition) {
492 /* Convert f_flags into access mode. We cannot use file->f_mode,
493 * because everything but O_ACCMODE mask was stripped from
495 if ((oit.it_flags + 1) & O_ACCMODE)
497 if (file->f_flags & O_TRUNC)
498 oit.it_flags |= FMODE_WRITE;
500 /* kernel only call f_op->open in dentry_open. filp_open calls
501 * dentry_open after call to open_namei that checks permissions.
502 * Only nfsd_open call dentry_open directly without checking
503 * permissions and because of that this code below is safe. */
504 if (oit.it_flags & FMODE_WRITE)
505 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
507 /* We do not want O_EXCL here, presumably we opened the file
508 * already? XXX - NFS implications? */
509 oit.it_flags &= ~O_EXCL;
514 /* Let's see if we have file open on MDS already. */
515 if (it->it_flags & FMODE_WRITE) {
516 och_p = &lli->lli_mds_write_och;
517 och_usecount = &lli->lli_open_fd_write_count;
518 } else if (it->it_flags & FMODE_EXEC) {
519 och_p = &lli->lli_mds_exec_och;
520 och_usecount = &lli->lli_open_fd_exec_count;
522 och_p = &lli->lli_mds_read_och;
523 och_usecount = &lli->lli_open_fd_read_count;
526 down(&lli->lli_och_sem);
527 if (*och_p) { /* Open handle is present */
528 if (it_disposition(it, DISP_OPEN_OPEN)) {
529 /* Well, there's extra open request that we do not need,
530 let's close it somehow. This will decref request. */
531 rc = it_open_error(DISP_OPEN_OPEN, it);
533 ll_file_data_put(fd);
534 GOTO(out_och_free, rc);
536 ll_release_openhandle(file->f_dentry, it);
537 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
542 rc = ll_local_open(file, it, fd, NULL);
544 up(&lli->lli_och_sem);
545 ll_file_data_put(fd);
549 LASSERT(*och_usecount == 0);
550 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
552 ll_file_data_put(fd);
553 GOTO(out_och_free, rc = -ENOMEM);
556 if (!it->d.lustre.it_disposition) {
557 it->it_flags |= O_CHECK_STALE;
558 rc = ll_intent_file_open(file, NULL, 0, it);
559 it->it_flags &= ~O_CHECK_STALE;
561 ll_file_data_put(fd);
562 GOTO(out_och_free, rc);
565 /* Got some error? Release the request */
566 if (it->d.lustre.it_status < 0) {
567 req = it->d.lustre.it_data;
568 ptlrpc_req_finished(req);
570 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
571 &it->d.lustre.it_lock_handle,
572 file->f_dentry->d_inode);
574 req = it->d.lustre.it_data;
576 /* md_intent_lock() didn't get a request ref if there was an
577 * open error, so don't do cleanup on the request here
579 /* XXX (green): Should not we bail out on any error here, not
580 * just open error? */
581 rc = it_open_error(DISP_OPEN_OPEN, it);
583 ll_file_data_put(fd);
584 GOTO(out_och_free, rc);
587 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
588 rc = ll_local_open(file, it, fd, *och_p);
590 up(&lli->lli_och_sem);
591 ll_file_data_put(fd);
592 GOTO(out_och_free, rc);
595 up(&lli->lli_och_sem);
597 /* Must do this outside lli_och_sem lock to prevent deadlock where
598 different kind of OPEN lock for this same inode gets cancelled
599 by ldlm_cancel_lru */
600 if (!S_ISREG(inode->i_mode))
607 if (file->f_flags & O_LOV_DELAY_CREATE ||
608 !(file->f_mode & FMODE_WRITE)) {
609 CDEBUG(D_INODE, "object creation was delayed\n");
613 file->f_flags &= ~O_LOV_DELAY_CREATE;
616 ptlrpc_req_finished(req);
618 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
622 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
623 *och_p = NULL; /* OBD_FREE writes some magic there */
626 up(&lli->lli_och_sem);
632 /* Fills the obdo with the attributes for the inode defined by lsm */
633 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
635 struct ptlrpc_request_set *set;
636 struct ll_inode_info *lli = ll_i2info(inode);
637 struct lov_stripe_md *lsm = lli->lli_smd;
639 struct obd_info oinfo = { { { 0 } } };
643 LASSERT(lsm != NULL);
647 oinfo.oi_oa->o_id = lsm->lsm_object_id;
648 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
649 oinfo.oi_oa->o_mode = S_IFREG;
650 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
651 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
652 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
653 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
655 oinfo.oi_capa = ll_mdscapa_get(inode);
657 set = ptlrpc_prep_set();
659 CERROR("can't allocate ptlrpc set\n");
662 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
664 rc = ptlrpc_set_wait(set);
665 ptlrpc_set_destroy(set);
667 capa_put(oinfo.oi_capa);
671 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
672 OBD_MD_FLATIME | OBD_MD_FLMTIME |
673 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
675 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
676 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
677 lli->lli_smd->lsm_object_id, i_size_read(inode),
678 inode->i_blocks, inode->i_blksize);
682 static inline void ll_remove_suid(struct inode *inode)
686 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
687 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
689 /* was any of the uid bits set? */
690 mode &= inode->i_mode;
691 if (mode && !capable(CAP_FSETID)) {
692 inode->i_mode &= ~mode;
693 // XXX careful here - we cannot change the size
697 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
699 struct ll_inode_info *lli = ll_i2info(inode);
700 struct lov_stripe_md *lsm = lli->lli_smd;
701 struct obd_export *exp = ll_i2dtexp(inode);
704 struct ldlm_lock *lock;
705 struct lov_stripe_md *lsm;
706 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
707 __u32 stripe, vallen = sizeof(stripe);
711 if (lsm->lsm_stripe_count == 1)
712 GOTO(check, stripe = 0);
714 /* get our offset in the lov */
715 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
717 CERROR("obd_get_info: rc = %d\n", rc);
720 LASSERT(stripe < lsm->lsm_stripe_count);
723 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
724 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
725 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
726 lsm->lsm_oinfo[stripe]->loi_id,
727 lsm->lsm_oinfo[stripe]->loi_gr);
728 RETURN(-ELDLM_NO_LOCK_DATA);
734 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
735 * we get a lock cancellation for each stripe, so we have to map the obd's
736 * region back onto the stripes in the file that it held.
738 * No one can dirty the extent until we've finished our work and they can
739 * enqueue another lock. The DLM protects us from ll_file_read/write here,
740 * but other kernel actors could have pages locked.
742 * Called with the DLM lock held. */
743 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
744 struct ldlm_lock *lock, __u32 stripe)
746 ldlm_policy_data_t tmpex;
747 unsigned long start, end, count, skip, i, j;
749 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
750 struct lustre_handle lockh;
751 struct address_space *mapping = inode->i_mapping;
754 tmpex = lock->l_policy_data;
755 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
756 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
759 /* our locks are page granular thanks to osc_enqueue, we invalidate the
761 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
762 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
763 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
765 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
766 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
770 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
771 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
772 if (lsm->lsm_stripe_count > 1) {
773 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
774 skip = (lsm->lsm_stripe_count - 1) * count;
775 start += start/count * skip + stripe * count;
777 end += end/count * skip + stripe * count;
779 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
782 i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
787 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
788 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
789 count, skip, end, discard ? " (DISCARDING)" : "");
791 /* walk through the vmas on the inode and tear down mmaped pages that
792 * intersect with the lock. this stops immediately if there are no
793 * mmap()ed regions of the file. This is not efficient at all and
794 * should be short lived. We'll associate mmap()ed pages with the lock
795 * and will be able to find them directly */
796 for (i = start; i <= end; i += (j + skip)) {
797 j = min(count - (i % count), end - i + 1);
800 if (ll_teardown_mmaps(mapping,
801 (__u64)i << CFS_PAGE_SHIFT,
802 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
806 /* this is the simplistic implementation of page eviction at
807 * cancelation. It is careful to get races with other page
808 * lockers handled correctly. fixes from bug 20 will make it
809 * more efficient by associating locks with pages and with
810 * batching writeback under the lock explicitly. */
811 for (i = start, j = start % count; i <= end;
812 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
814 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
820 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
821 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
822 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
825 if (!mapping_has_pages(mapping)) {
826 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
832 page = find_get_page(mapping, i);
835 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
836 i, tmpex.l_extent.start);
839 /* page->mapping to check with racing against teardown */
840 if (!discard && clear_page_dirty_for_io(page)) {
841 rc = ll_call_writepage(inode, page);
842 /* either waiting for io to complete or reacquiring
843 * the lock that the failed writepage released */
845 wait_on_page_writeback(page);
847 CERROR("writepage inode %lu(%p) of page %p "
848 "failed: %d\n", inode->i_ino, inode,
851 set_bit(AS_ENOSPC, &mapping->flags);
853 set_bit(AS_EIO, &mapping->flags);
857 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
858 /* check to see if another DLM lock covers this page b=2765 */
859 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
860 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
862 &lock->l_resource->lr_name, LDLM_EXTENT,
863 &tmpex, LCK_PR | LCK_PW, &lockh);
865 if (rc2 <= 0 && page->mapping != NULL) {
866 struct ll_async_page *llap = llap_cast_private(page);
867 /* checking again to account for writeback's
869 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
871 ll_ra_accounting(llap, mapping);
872 ll_truncate_complete_page(page);
875 page_cache_release(page);
877 LASSERTF(tmpex.l_extent.start <=
878 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
879 lock->l_policy_data.l_extent.end + 1),
880 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
881 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
886 static int ll_extent_lock_callback(struct ldlm_lock *lock,
887 struct ldlm_lock_desc *new, void *data,
890 struct lustre_handle lockh = { 0 };
894 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
895 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
900 case LDLM_CB_BLOCKING:
901 ldlm_lock2handle(lock, &lockh);
902 rc = ldlm_cli_cancel(&lockh);
904 CERROR("ldlm_cli_cancel failed: %d\n", rc);
906 case LDLM_CB_CANCELING: {
908 struct ll_inode_info *lli;
909 struct lov_stripe_md *lsm;
913 /* This lock wasn't granted, don't try to evict pages */
914 if (lock->l_req_mode != lock->l_granted_mode)
917 inode = ll_inode_from_lock(lock);
920 lli = ll_i2info(inode);
923 if (lli->lli_smd == NULL)
927 stripe = ll_lock_to_stripe_offset(inode, lock);
931 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
933 lov_stripe_lock(lsm);
934 lock_res_and_lock(lock);
935 kms = ldlm_extent_shift_kms(lock,
936 lsm->lsm_oinfo[stripe]->loi_kms);
938 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
939 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
940 lsm->lsm_oinfo[stripe]->loi_kms, kms);
941 lsm->lsm_oinfo[stripe]->loi_kms = kms;
942 unlock_res_and_lock(lock);
943 lov_stripe_unlock(lsm);
956 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
958 /* XXX ALLOCATE - 160 bytes */
959 struct inode *inode = ll_inode_from_lock(lock);
960 struct ll_inode_info *lli = ll_i2info(inode);
961 struct lustre_handle lockh = { 0 };
966 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
967 LDLM_FL_BLOCK_CONV)) {
968 LBUG(); /* not expecting any blocked async locks yet */
969 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
971 ldlm_lock_dump(D_OTHER, lock, 0);
972 ldlm_reprocess_all(lock->l_resource);
976 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
978 stripe = ll_lock_to_stripe_offset(inode, lock);
982 if (lock->l_lvb_len) {
983 struct lov_stripe_md *lsm = lli->lli_smd;
985 lvb = lock->l_lvb_data;
986 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
988 lock_res_and_lock(lock);
989 ll_inode_size_lock(inode, 1);
990 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
991 kms = ldlm_extent_shift_kms(NULL, kms);
992 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
993 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
994 lsm->lsm_oinfo[stripe].loi_kms, kms);
995 lsm->lsm_oinfo[stripe].loi_kms = kms;
996 ll_inode_size_unlock(inode, 1);
997 unlock_res_and_lock(lock);
1002 wake_up(&lock->l_waitq);
1004 ldlm_lock2handle(lock, &lockh);
1005 ldlm_lock_decref(&lockh, LCK_PR);
1010 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1012 struct ptlrpc_request *req = reqp;
1013 struct inode *inode = ll_inode_from_lock(lock);
1014 struct ll_inode_info *lli;
1015 struct lov_stripe_md *lsm;
1016 struct ost_lvb *lvb;
1018 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1022 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1023 lli = ll_i2info(inode);
1025 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1028 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1030 /* First, find out which stripe index this lock corresponds to. */
1031 stripe = ll_lock_to_stripe_offset(inode, lock);
1033 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1035 rc = lustre_pack_reply(req, 2, size, NULL);
1037 CERROR("lustre_pack_reply: %d\n", rc);
1041 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1042 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1043 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1044 lvb->lvb_atime = LTIME_S(inode->i_atime);
1045 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1047 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1048 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1049 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1050 lvb->lvb_atime, lvb->lvb_ctime);
1055 /* These errors are normal races, so we don't want to fill the console
1056 * with messages by calling ptlrpc_error() */
1057 if (rc == -ELDLM_NO_LOCK_DATA)
1058 lustre_pack_reply(req, 1, NULL, NULL);
1060 req->rq_status = rc;
1064 static void ll_merge_lvb(struct inode *inode)
1066 struct ll_inode_info *lli = ll_i2info(inode);
1067 struct ll_sb_info *sbi = ll_i2sbi(inode);
1071 ll_inode_size_lock(inode, 1);
1072 inode_init_lvb(inode, &lvb);
1073 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1074 i_size_write(inode, lvb.lvb_size);
1075 inode->i_blocks = lvb.lvb_blocks;
1076 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1077 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1078 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1079 ll_inode_size_unlock(inode, 1);
1083 int ll_local_size(struct inode *inode)
1085 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1086 struct ll_inode_info *lli = ll_i2info(inode);
1087 struct ll_sb_info *sbi = ll_i2sbi(inode);
1088 struct lustre_handle lockh = { 0 };
1093 if (lli->lli_smd->lsm_stripe_count == 0)
1096 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1097 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1103 ll_merge_lvb(inode);
1104 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
1108 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1111 struct lustre_handle lockh = { 0 };
1112 struct ldlm_enqueue_info einfo = { 0 };
1113 struct obd_info oinfo = { { { 0 } } };
1119 einfo.ei_type = LDLM_EXTENT;
1120 einfo.ei_mode = LCK_PR;
1121 einfo.ei_cb_bl = ll_extent_lock_callback;
1122 einfo.ei_cb_cp = ldlm_completion_ast;
1123 einfo.ei_cb_gl = ll_glimpse_callback;
1124 einfo.ei_cbdata = NULL;
1126 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1127 oinfo.oi_lockh = &lockh;
1129 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1131 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1135 CERROR("obd_enqueue returned rc %d, "
1136 "returning -EIO\n", rc);
1137 RETURN(rc > 0 ? -EIO : rc);
1140 lov_stripe_lock(lsm);
1141 memset(&lvb, 0, sizeof(lvb));
1142 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1143 st->st_size = lvb.lvb_size;
1144 st->st_blocks = lvb.lvb_blocks;
1145 st->st_mtime = lvb.lvb_mtime;
1146 st->st_atime = lvb.lvb_atime;
1147 st->st_ctime = lvb.lvb_ctime;
1148 lov_stripe_unlock(lsm);
1153 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1154 * file (because it prefers KMS over RSS when larger) */
1155 int ll_glimpse_size(struct inode *inode, int ast_flags)
1157 struct ll_inode_info *lli = ll_i2info(inode);
1158 struct ll_sb_info *sbi = ll_i2sbi(inode);
1159 struct lustre_handle lockh = { 0 };
1160 struct ldlm_enqueue_info einfo = { 0 };
1161 struct obd_info oinfo = { { { 0 } } };
1165 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1168 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1170 if (!lli->lli_smd) {
1171 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1175 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1176 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1177 * won't revoke any conflicting DLM locks held. Instead,
1178 * ll_glimpse_callback() will be called on each client
1179 * holding a DLM lock against this file, and resulting size
1180 * will be returned for each stripe. DLM lock on [0, EOF] is
1181 * acquired only if there were no conflicting locks. */
1182 einfo.ei_type = LDLM_EXTENT;
1183 einfo.ei_mode = LCK_PR;
1184 einfo.ei_cb_bl = ll_extent_lock_callback;
1185 einfo.ei_cb_cp = ldlm_completion_ast;
1186 einfo.ei_cb_gl = ll_glimpse_callback;
1187 einfo.ei_cbdata = inode;
1189 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1190 oinfo.oi_lockh = &lockh;
1191 oinfo.oi_md = lli->lli_smd;
1192 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1194 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1198 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1199 RETURN(rc > 0 ? -EIO : rc);
1202 ll_merge_lvb(inode);
1204 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1205 i_size_read(inode), inode->i_blocks);
1210 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1211 struct lov_stripe_md *lsm, int mode,
1212 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1215 struct ll_sb_info *sbi = ll_i2sbi(inode);
1217 struct ldlm_enqueue_info einfo = { 0 };
1218 struct obd_info oinfo = { { { 0 } } };
1222 LASSERT(!lustre_handle_is_used(lockh));
1223 LASSERT(lsm != NULL);
1225 /* don't drop the mmapped file to LRU */
1226 if (mapping_mapped(inode->i_mapping))
1227 ast_flags |= LDLM_FL_NO_LRU;
1229 /* XXX phil: can we do this? won't it screw the file size up? */
1230 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1231 (sbi->ll_flags & LL_SBI_NOLCK))
1234 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1235 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1237 einfo.ei_type = LDLM_EXTENT;
1238 einfo.ei_mode = mode;
1239 einfo.ei_cb_bl = ll_extent_lock_callback;
1240 einfo.ei_cb_cp = ldlm_completion_ast;
1241 einfo.ei_cb_gl = ll_glimpse_callback;
1242 einfo.ei_cbdata = inode;
1244 oinfo.oi_policy = *policy;
1245 oinfo.oi_lockh = lockh;
1247 oinfo.oi_flags = ast_flags;
1249 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
1250 *policy = oinfo.oi_policy;
1254 ll_inode_size_lock(inode, 1);
1255 inode_init_lvb(inode, &lvb);
1256 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1258 if (policy->l_extent.start == 0 &&
1259 policy->l_extent.end == OBD_OBJECT_EOF) {
1260 /* vmtruncate()->ll_truncate() first sets the i_size and then
1261 * the kms under both a DLM lock and the
1262 * ll_inode_size_lock(). If we don't get the
1263 * ll_inode_size_lock() here we can match the DLM lock and
1264 * reset i_size from the kms before the truncating path has
1265 * updated the kms. generic_file_write can then trust the
1266 * stale i_size when doing appending writes and effectively
1267 * cancel the result of the truncate. Getting the
1268 * ll_inode_size_lock() after the enqueue maintains the DLM
1269 * -> ll_inode_size_lock() acquiring order. */
1270 i_size_write(inode, lvb.lvb_size);
1271 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1272 inode->i_ino, i_size_read(inode));
1276 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1277 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1278 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1280 ll_inode_size_unlock(inode, 1);
1285 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1286 struct lov_stripe_md *lsm, int mode,
1287 struct lustre_handle *lockh)
1289 struct ll_sb_info *sbi = ll_i2sbi(inode);
1293 /* XXX phil: can we do this? won't it screw the file size up? */
1294 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1295 (sbi->ll_flags & LL_SBI_NOLCK))
1298 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
1303 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1306 struct inode *inode = file->f_dentry->d_inode;
1307 struct ll_inode_info *lli = ll_i2info(inode);
1308 struct lov_stripe_md *lsm = lli->lli_smd;
1309 struct ll_sb_info *sbi = ll_i2sbi(inode);
1310 struct ll_lock_tree tree;
1311 struct ll_lock_tree_node *node;
1313 struct ll_ra_read bead;
1316 ssize_t retval, chunk, sum = 0;
1320 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1321 inode->i_ino, inode->i_generation, inode, count, *ppos);
1322 /* "If nbyte is 0, read() will return 0 and have no other results."
1323 * -- Single Unix Spec */
1327 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1330 /* Read on file with no objects should return zero-filled
1331 * buffers up to file size (we can get non-zero sizes with
1332 * mknod + truncate, then opening file for read. This is a
1333 * common pattern in NFS case, it seems). Bug 6243 */
1335 /* Since there are no objects on OSTs, we have nothing to get
1336 * lock on and so we are forced to access inode->i_size
1339 /* Read beyond end of file */
1340 if (*ppos >= i_size_read(inode))
1343 if (count > i_size_read(inode) - *ppos)
1344 count = i_size_read(inode) - *ppos;
1345 /* Make sure to correctly adjust the file pos pointer for
1347 notzeroed = clear_user(buf, count);
1356 if (sbi->ll_max_rw_chunk != 0) {
1357 /* first, let's know the end of the current stripe */
1359 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1362 /* correct, the end is beyond the request */
1363 if (end > *ppos + count - 1)
1364 end = *ppos + count - 1;
1366 /* and chunk shouldn't be too large even if striping is wide */
1367 if (end - *ppos > sbi->ll_max_rw_chunk)
1368 end = *ppos + sbi->ll_max_rw_chunk - 1;
1370 end = *ppos + count - 1;
1373 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1375 GOTO(out, retval = PTR_ERR(node));
1378 tree.lt_fd = LUSTRE_FPRIVATE(file);
1379 rc = ll_tree_lock(&tree, node, buf, count,
1380 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1382 GOTO(out, retval = rc);
1384 ll_inode_size_lock(inode, 1);
1386 * Consistency guarantees: following possibilities exist for the
1387 * relation between region being read and real file size at this
1390 * (A): the region is completely inside of the file;
1392 * (B-x): x bytes of region are inside of the file, the rest is
1395 * (C): the region is completely outside of the file.
1397 * This classification is stable under DLM lock acquired by
1398 * ll_tree_lock() above, because to change class, other client has to
1399 * take DLM lock conflicting with our lock. Also, any updates to
1400 * ->i_size by other threads on this client are serialized by
1401 * ll_inode_size_lock(). This guarantees that short reads are handled
1402 * correctly in the face of concurrent writes and truncates.
1404 inode_init_lvb(inode, &lvb);
1405 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1407 if (*ppos + count - 1 > kms) {
1408 /* A glimpse is necessary to determine whether we return a
1409 * short read (B) or some zeroes at the end of the buffer (C) */
1410 ll_inode_size_unlock(inode, 1);
1411 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1413 ll_tree_unlock(&tree);
1417 /* region is within kms and, hence, within real file size (A).
1418 * We need to increase i_size to cover the read region so that
1419 * generic_file_read() will do its job, but that doesn't mean
1420 * the kms size is _correct_, it is only the _minimum_ size.
1421 * If someone does a stat they will get the correct size which
1422 * will always be >= the kms value here. b=11081 */
1423 if (i_size_read(inode) < kms)
1424 i_size_write(inode, kms);
1425 ll_inode_size_unlock(inode, 1);
1428 chunk = end - *ppos + 1;
1429 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1430 inode->i_ino, chunk, *ppos, i_size_read(inode));
1432 /* turn off the kernel's read-ahead */
1433 file->f_ra.ra_pages = 0;
1435 /* initialize read-ahead window once per syscall */
1438 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1439 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1440 ll_ra_read_in(file, &bead);
1444 file_accessed(file);
1445 retval = generic_file_read(file, buf, chunk, ppos);
1446 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1448 ll_tree_unlock(&tree);
1454 if (retval == chunk && count > 0)
1460 ll_ra_read_ex(file, &bead);
1461 retval = (sum > 0) ? sum : retval;
1466 * Write to a file (through the page cache).
1468 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1471 struct inode *inode = file->f_dentry->d_inode;
1472 struct ll_sb_info *sbi = ll_i2sbi(inode);
1473 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1474 struct ll_lock_tree tree;
1475 struct ll_lock_tree_node *node;
1476 loff_t maxbytes = ll_file_maxbytes(inode);
1477 loff_t lock_start, lock_end, end;
1478 ssize_t retval, chunk, sum = 0;
1482 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1483 inode->i_ino, inode->i_generation, inode, count, *ppos);
1485 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1487 /* POSIX, but surprised the VFS doesn't check this already */
1491 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1492 * called on the file, don't fail the below assertion (bug 2388). */
1493 if (file->f_flags & O_LOV_DELAY_CREATE &&
1494 ll_i2info(inode)->lli_smd == NULL)
1497 LASSERT(ll_i2info(inode)->lli_smd != NULL);
1499 down(&ll_i2info(inode)->lli_write_sem);
1502 chunk = 0; /* just to fix gcc's warning */
1503 end = *ppos + count - 1;
1505 if (file->f_flags & O_APPEND) {
1507 lock_end = OBD_OBJECT_EOF;
1508 } else if (sbi->ll_max_rw_chunk != 0) {
1509 /* first, let's know the end of the current stripe */
1511 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1514 /* correct, the end is beyond the request */
1515 if (end > *ppos + count - 1)
1516 end = *ppos + count - 1;
1518 /* and chunk shouldn't be too large even if striping is wide */
1519 if (end - *ppos > sbi->ll_max_rw_chunk)
1520 end = *ppos + sbi->ll_max_rw_chunk - 1;
1525 lock_end = *ppos + count - 1;
1527 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1530 GOTO(out, retval = PTR_ERR(node));
1532 tree.lt_fd = LUSTRE_FPRIVATE(file);
1533 rc = ll_tree_lock(&tree, node, buf, count,
1534 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1536 GOTO(out, retval = rc);
1538 /* This is ok, g_f_w will overwrite this under i_sem if it races
1539 * with a local truncate, it just makes our maxbyte checking easier.
1540 * The i_size value gets updated in ll_extent_lock() as a consequence
1541 * of the [0,EOF] extent lock we requested above. */
1542 if (file->f_flags & O_APPEND) {
1543 *ppos = i_size_read(inode);
1544 end = *ppos + count - 1;
1547 if (*ppos >= maxbytes) {
1548 send_sig(SIGXFSZ, current, 0);
1549 GOTO(out_unlock, retval = -EFBIG);
1551 if (*ppos + count > maxbytes)
1552 count = maxbytes - *ppos;
1554 /* generic_file_write handles O_APPEND after getting i_mutex */
1555 chunk = end - *ppos + 1;
1556 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1557 inode->i_ino, chunk, *ppos);
1558 retval = generic_file_write(file, buf, chunk, ppos);
1559 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1562 ll_tree_unlock(&tree);
1569 if (retval == chunk && count > 0)
1573 up(&ll_i2info(inode)->lli_write_sem);
1575 retval = (sum > 0) ? sum : retval;
1576 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1577 retval > 0 ? retval : 0);
1582 * Send file content (through pagecache) somewhere with helper
1584 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1585 read_actor_t actor, void *target)
1587 struct inode *inode = in_file->f_dentry->d_inode;
1588 struct ll_inode_info *lli = ll_i2info(inode);
1589 struct lov_stripe_md *lsm = lli->lli_smd;
1590 struct ll_lock_tree tree;
1591 struct ll_lock_tree_node *node;
1593 struct ll_ra_read bead;
1598 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1599 inode->i_ino, inode->i_generation, inode, count, *ppos);
1601 /* "If nbyte is 0, read() will return 0 and have no other results."
1602 * -- Single Unix Spec */
1606 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1607 /* turn off the kernel's read-ahead */
1608 in_file->f_ra.ra_pages = 0;
1610 /* File with no objects, nothing to lock */
1612 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1614 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1616 RETURN(PTR_ERR(node));
1618 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1619 rc = ll_tree_lock(&tree, node, NULL, count,
1620 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1624 ll_inode_size_lock(inode, 1);
1626 * Consistency guarantees: following possibilities exist for the
1627 * relation between region being read and real file size at this
1630 * (A): the region is completely inside of the file;
1632 * (B-x): x bytes of region are inside of the file, the rest is
1635 * (C): the region is completely outside of the file.
1637 * This classification is stable under DLM lock acquired by
1638 * ll_tree_lock() above, because to change class, other client has to
1639 * take DLM lock conflicting with our lock. Also, any updates to
1640 * ->i_size by other threads on this client are serialized by
1641 * ll_inode_size_lock(). This guarantees that short reads are handled
1642 * correctly in the face of concurrent writes and truncates.
1644 inode_init_lvb(inode, &lvb);
1645 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1647 if (*ppos + count - 1 > kms) {
1648 /* A glimpse is necessary to determine whether we return a
1649 * short read (B) or some zeroes at the end of the buffer (C) */
1650 ll_inode_size_unlock(inode, 1);
1651 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1655 /* region is within kms and, hence, within real file size (A) */
1656 i_size_write(inode, kms);
1657 ll_inode_size_unlock(inode, 1);
1660 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1661 inode->i_ino, count, *ppos, i_size_read(inode));
1663 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1664 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1665 ll_ra_read_in(in_file, &bead);
1667 file_accessed(in_file);
1668 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1669 ll_ra_read_ex(in_file, &bead);
1672 ll_tree_unlock(&tree);
1676 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1679 struct ll_inode_info *lli = ll_i2info(inode);
1680 struct obd_export *exp = ll_i2dtexp(inode);
1681 struct ll_recreate_obj ucreatp;
1682 struct obd_trans_info oti = { 0 };
1683 struct obdo *oa = NULL;
1686 struct lov_stripe_md *lsm, *lsm2;
1689 if (!capable (CAP_SYS_ADMIN))
1692 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1693 sizeof(struct ll_recreate_obj));
1701 down(&lli->lli_size_sem);
1704 GOTO(out, rc = -ENOENT);
1705 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1706 (lsm->lsm_stripe_count));
1708 OBD_ALLOC(lsm2, lsm_size);
1710 GOTO(out, rc = -ENOMEM);
1712 oa->o_id = ucreatp.lrc_id;
1713 oa->o_gr = ucreatp.lrc_group;
1714 oa->o_nlink = ucreatp.lrc_ost_idx;
1715 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1716 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1717 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1718 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1720 oti.oti_objid = NULL;
1721 memcpy(lsm2, lsm, lsm_size);
1722 rc = obd_create(exp, oa, &lsm2, &oti);
1724 OBD_FREE(lsm2, lsm_size);
1727 up(&lli->lli_size_sem);
1732 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1733 int flags, struct lov_user_md *lum, int lum_size)
1735 struct ll_inode_info *lli = ll_i2info(inode);
1736 struct lov_stripe_md *lsm;
1737 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1741 down(&lli->lli_size_sem);
1744 up(&lli->lli_size_sem);
1745 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1750 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1753 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1754 GOTO(out_req_free, rc = -ENOENT);
1755 rc = oit.d.lustre.it_status;
1757 GOTO(out_req_free, rc);
1759 ll_release_openhandle(file->f_dentry, &oit);
1762 up(&lli->lli_size_sem);
1763 ll_intent_release(&oit);
1766 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1770 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1771 struct lov_mds_md **lmmp, int *lmm_size,
1772 struct ptlrpc_request **request)
1774 struct ll_sb_info *sbi = ll_i2sbi(inode);
1775 struct mdt_body *body;
1776 struct lov_mds_md *lmm = NULL;
1777 struct ptlrpc_request *req = NULL;
1778 struct obd_capa *oc;
1781 rc = ll_get_max_mdsize(sbi, &lmmsize);
1785 oc = ll_mdscapa_get(inode);
1786 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1787 oc, filename, strlen(filename) + 1,
1788 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1791 CDEBUG(D_INFO, "md_getattr_name failed "
1792 "on %s: rc %d\n", filename, rc);
1796 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1797 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1798 /* swabbed by mdc_getattr_name */
1799 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1801 lmmsize = body->eadatasize;
1803 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1805 GOTO(out, rc = -ENODATA);
1808 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1809 LASSERT(lmm != NULL);
1810 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1813 * This is coming from the MDS, so is probably in
1814 * little endian. We convert it to host endian before
1815 * passing it to userspace.
1817 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1818 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1819 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1820 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1821 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1824 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1825 struct lov_stripe_md *lsm;
1826 struct lov_user_md_join *lmj;
1827 int lmj_size, i, aindex = 0;
1829 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1831 GOTO(out, rc = -ENOMEM);
1832 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1834 GOTO(out_free_memmd, rc);
1836 lmj_size = sizeof(struct lov_user_md_join) +
1837 lsm->lsm_stripe_count *
1838 sizeof(struct lov_user_ost_data_join);
1839 OBD_ALLOC(lmj, lmj_size);
1841 GOTO(out_free_memmd, rc = -ENOMEM);
1843 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1844 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1845 struct lov_extent *lex =
1846 &lsm->lsm_array->lai_ext_array[aindex];
1848 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1850 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1851 LPU64" len %d\n", aindex, i,
1852 lex->le_start, (int)lex->le_len);
1853 lmj->lmm_objects[i].l_extent_start =
1856 if ((int)lex->le_len == -1)
1857 lmj->lmm_objects[i].l_extent_end = -1;
1859 lmj->lmm_objects[i].l_extent_end =
1860 lex->le_start + lex->le_len;
1861 lmj->lmm_objects[i].l_object_id =
1862 lsm->lsm_oinfo[i]->loi_id;
1863 lmj->lmm_objects[i].l_object_gr =
1864 lsm->lsm_oinfo[i]->loi_gr;
1865 lmj->lmm_objects[i].l_ost_gen =
1866 lsm->lsm_oinfo[i]->loi_ost_gen;
1867 lmj->lmm_objects[i].l_ost_idx =
1868 lsm->lsm_oinfo[i]->loi_ost_idx;
1870 lmm = (struct lov_mds_md *)lmj;
1873 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1877 *lmm_size = lmmsize;
1882 static int ll_lov_setea(struct inode *inode, struct file *file,
1885 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1886 struct lov_user_md *lump;
1887 int lum_size = sizeof(struct lov_user_md) +
1888 sizeof(struct lov_user_ost_data);
1892 if (!capable (CAP_SYS_ADMIN))
1895 OBD_ALLOC(lump, lum_size);
1899 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1901 OBD_FREE(lump, lum_size);
1905 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1907 OBD_FREE(lump, lum_size);
1911 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1914 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1916 int flags = FMODE_WRITE;
1919 /* Bug 1152: copy properly when this is no longer true */
1920 LASSERT(sizeof(lum) == sizeof(*lump));
1921 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1922 rc = copy_from_user(&lum, lump, sizeof(lum));
1926 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1928 put_user(0, &lump->lmm_stripe_count);
1929 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1930 0, ll_i2info(inode)->lli_smd, lump);
1935 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1937 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1942 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1946 static int ll_get_grouplock(struct inode *inode, struct file *file,
1949 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1950 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1951 .end = OBD_OBJECT_EOF}};
1952 struct lustre_handle lockh = { 0 };
1953 struct ll_inode_info *lli = ll_i2info(inode);
1954 struct lov_stripe_md *lsm = lli->lli_smd;
1958 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1962 policy.l_extent.gid = arg;
1963 if (file->f_flags & O_NONBLOCK)
1964 flags = LDLM_FL_BLOCK_NOWAIT;
1966 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1970 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1972 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
1977 static int ll_put_grouplock(struct inode *inode, struct file *file,
1980 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1981 struct ll_inode_info *lli = ll_i2info(inode);
1982 struct lov_stripe_md *lsm = lli->lli_smd;
1986 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1987 /* Ugh, it's already unlocked. */
1991 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1994 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1996 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2001 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2006 static int join_sanity_check(struct inode *head, struct inode *tail)
2009 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2010 CERROR("server do not support join \n");
2013 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2014 CERROR("tail ino %lu and ino head %lu must be regular\n",
2015 head->i_ino, tail->i_ino);
2018 if (head->i_ino == tail->i_ino) {
2019 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2022 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2023 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
2029 static int join_file(struct inode *head_inode, struct file *head_filp,
2030 struct file *tail_filp)
2032 struct dentry *tail_dentry = tail_filp->f_dentry;
2033 struct lookup_intent oit = {.it_op = IT_OPEN,
2034 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2035 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2036 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2038 struct lustre_handle lockh;
2039 struct md_op_data *op_data;
2044 tail_dentry = tail_filp->f_dentry;
2046 data = i_size_read(head_inode);
2047 op_data = ll_prep_md_op_data(NULL, head_inode,
2048 tail_dentry->d_parent->d_inode,
2049 tail_dentry->d_name.name,
2050 tail_dentry->d_name.len, 0,
2051 LUSTRE_OPC_ANY, &data);
2052 if (IS_ERR(op_data))
2053 RETURN(PTR_ERR(op_data));
2055 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2056 op_data, &lockh, NULL, 0, 0);
2058 ll_finish_md_op_data(op_data);
2062 rc = oit.d.lustre.it_status;
2064 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2065 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2066 ptlrpc_req_finished((struct ptlrpc_request *)
2067 oit.d.lustre.it_data);
2071 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2073 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2074 oit.d.lustre.it_lock_mode = 0;
2076 ll_release_openhandle(head_filp->f_dentry, &oit);
2078 ll_intent_release(&oit);
2082 static int ll_file_join(struct inode *head, struct file *filp,
2083 char *filename_tail)
2085 struct inode *tail = NULL, *first = NULL, *second = NULL;
2086 struct dentry *tail_dentry;
2087 struct file *tail_filp, *first_filp, *second_filp;
2088 struct ll_lock_tree first_tree, second_tree;
2089 struct ll_lock_tree_node *first_node, *second_node;
2090 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2091 int rc = 0, cleanup_phase = 0;
2094 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2095 head->i_ino, head->i_generation, head, filename_tail);
2097 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2098 if (IS_ERR(tail_filp)) {
2099 CERROR("Can not open tail file %s", filename_tail);
2100 rc = PTR_ERR(tail_filp);
2103 tail = igrab(tail_filp->f_dentry->d_inode);
2105 tlli = ll_i2info(tail);
2106 tail_dentry = tail_filp->f_dentry;
2107 LASSERT(tail_dentry);
2110 /*reorder the inode for lock sequence*/
2111 first = head->i_ino > tail->i_ino ? head : tail;
2112 second = head->i_ino > tail->i_ino ? tail : head;
2113 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2114 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2116 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2117 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2118 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2119 if (IS_ERR(first_node)){
2120 rc = PTR_ERR(first_node);
2123 first_tree.lt_fd = first_filp->private_data;
2124 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2129 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2130 if (IS_ERR(second_node)){
2131 rc = PTR_ERR(second_node);
2134 second_tree.lt_fd = second_filp->private_data;
2135 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2140 rc = join_sanity_check(head, tail);
2144 rc = join_file(head, filp, tail_filp);
2148 switch (cleanup_phase) {
2150 ll_tree_unlock(&second_tree);
2151 obd_cancel_unused(ll_i2dtexp(second),
2152 ll_i2info(second)->lli_smd, 0, NULL);
2154 ll_tree_unlock(&first_tree);
2155 obd_cancel_unused(ll_i2dtexp(first),
2156 ll_i2info(first)->lli_smd, 0, NULL);
2158 filp_close(tail_filp, 0);
2161 if (head && rc == 0) {
2162 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2164 hlli->lli_smd = NULL;
2169 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2175 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2177 struct inode *inode = dentry->d_inode;
2178 struct obd_client_handle *och;
2184 /* Root ? Do nothing. */
2185 if (dentry->d_inode->i_sb->s_root == dentry)
2188 /* No open handle to close? Move away */
2189 if (!it_disposition(it, DISP_OPEN_OPEN))
2192 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2194 OBD_ALLOC(och, sizeof(*och));
2196 GOTO(out, rc = -ENOMEM);
2198 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2199 ll_i2info(inode), it, och);
2201 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2204 /* this one is in place of ll_file_open */
2205 ptlrpc_req_finished(it->d.lustre.it_data);
2206 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2210 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2213 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2217 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2218 inode->i_generation, inode, cmd);
2219 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2221 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2222 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2226 case LL_IOC_GETFLAGS:
2227 /* Get the current value of the file flags */
2228 return put_user(fd->fd_flags, (int *)arg);
2229 case LL_IOC_SETFLAGS:
2230 case LL_IOC_CLRFLAGS:
2231 /* Set or clear specific file flags */
2232 /* XXX This probably needs checks to ensure the flags are
2233 * not abused, and to handle any flag side effects.
2235 if (get_user(flags, (int *) arg))
2238 if (cmd == LL_IOC_SETFLAGS) {
2239 if ((flags & LL_FILE_IGNORE_LOCK) &&
2240 !(file->f_flags & O_DIRECT)) {
2241 CERROR("%s: unable to disable locking on "
2242 "non-O_DIRECT file\n", current->comm);
2246 fd->fd_flags |= flags;
2248 fd->fd_flags &= ~flags;
2251 case LL_IOC_LOV_SETSTRIPE:
2252 RETURN(ll_lov_setstripe(inode, file, arg));
2253 case LL_IOC_LOV_SETEA:
2254 RETURN(ll_lov_setea(inode, file, arg));
2255 case LL_IOC_LOV_GETSTRIPE:
2256 RETURN(ll_lov_getstripe(inode, arg));
2257 case LL_IOC_RECREATE_OBJ:
2258 RETURN(ll_lov_recreate_obj(inode, file, arg));
2259 case EXT3_IOC_GETFLAGS:
2260 case EXT3_IOC_SETFLAGS:
2261 RETURN(ll_iocontrol(inode, file, cmd, arg));
2262 case EXT3_IOC_GETVERSION_OLD:
2263 case EXT3_IOC_GETVERSION:
2264 RETURN(put_user(inode->i_generation, (int *)arg));
2269 ftail = getname((const char *)arg);
2271 RETURN(PTR_ERR(ftail));
2272 rc = ll_file_join(inode, file, ftail);
2276 case LL_IOC_GROUP_LOCK:
2277 RETURN(ll_get_grouplock(inode, file, arg));
2278 case LL_IOC_GROUP_UNLOCK:
2279 RETURN(ll_put_grouplock(inode, file, arg));
2280 case IOC_OBD_STATFS:
2281 RETURN(ll_obd_statfs(inode, (void *)arg));
2283 /* We need to special case any other ioctls we want to handle,
2284 * to send them to the MDS/OST as appropriate and to properly
2285 * network encode the arg field.
2286 case EXT3_IOC_SETVERSION_OLD:
2287 case EXT3_IOC_SETVERSION:
2289 case LL_IOC_FLUSHCTX:
2290 RETURN(ll_flush_ctx(inode));
2291 case LL_IOC_GETFACL: {
2292 struct rmtacl_ioctl_data ioc;
2294 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2297 RETURN(ll_ioctl_getfacl(inode, &ioc));
2299 case LL_IOC_SETFACL: {
2300 struct rmtacl_ioctl_data ioc;
2302 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2305 RETURN(ll_ioctl_setfacl(inode, &ioc));
2311 ll_iocontrol_call(inode, file, cmd, arg, &err))
2314 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2320 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2322 struct inode *inode = file->f_dentry->d_inode;
2323 struct ll_inode_info *lli = ll_i2info(inode);
2324 struct lov_stripe_md *lsm = lli->lli_smd;
2327 retval = offset + ((origin == 2) ? i_size_read(inode) :
2328 (origin == 1) ? file->f_pos : 0);
2329 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2330 inode->i_ino, inode->i_generation, inode, retval, retval,
2331 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2332 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2334 if (origin == 2) { /* SEEK_END */
2335 int nonblock = 0, rc;
2337 if (file->f_flags & O_NONBLOCK)
2338 nonblock = LDLM_FL_BLOCK_NOWAIT;
2341 rc = ll_glimpse_size(inode, nonblock);
2346 ll_inode_size_lock(inode, 0);
2347 offset += i_size_read(inode);
2348 ll_inode_size_unlock(inode, 0);
2349 } else if (origin == 1) { /* SEEK_CUR */
2350 offset += file->f_pos;
2354 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2355 if (offset != file->f_pos) {
2356 file->f_pos = offset;
2357 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2359 file->f_version = ++event;
2368 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2370 struct inode *inode = dentry->d_inode;
2371 struct ll_inode_info *lli = ll_i2info(inode);
2372 struct lov_stripe_md *lsm = lli->lli_smd;
2373 struct ptlrpc_request *req;
2374 struct obd_capa *oc;
2377 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2378 inode->i_generation, inode);
2379 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2381 /* fsync's caller has already called _fdata{sync,write}, we want
2382 * that IO to finish before calling the osc and mdc sync methods */
2383 rc = filemap_fdatawait(inode->i_mapping);
2385 /* catch async errors that were recorded back when async writeback
2386 * failed for pages in this mapping. */
2387 err = lli->lli_async_rc;
2388 lli->lli_async_rc = 0;
2392 err = lov_test_and_clear_async_rc(lsm);
2397 oc = ll_mdscapa_get(inode);
2398 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2404 ptlrpc_req_finished(req);
2411 RETURN(rc ? rc : -ENOMEM);
2413 oa->o_id = lsm->lsm_object_id;
2414 oa->o_gr = lsm->lsm_object_gr;
2415 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2416 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2417 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2420 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2421 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2422 0, OBD_OBJECT_EOF, oc);
2432 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2434 struct inode *inode = file->f_dentry->d_inode;
2435 struct ll_sb_info *sbi = ll_i2sbi(inode);
2436 struct ldlm_res_id res_id =
2437 { .name = { fid_seq(ll_inode2fid(inode)),
2438 fid_oid(ll_inode2fid(inode)),
2439 fid_ver(ll_inode2fid(inode)),
2441 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2442 ldlm_flock_completion_ast, NULL, file_lock };
2443 struct lustre_handle lockh = {0};
2444 ldlm_policy_data_t flock;
2449 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2450 inode->i_ino, file_lock);
2452 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2454 if (file_lock->fl_flags & FL_FLOCK) {
2455 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2456 /* set missing params for flock() calls */
2457 file_lock->fl_end = OFFSET_MAX;
2458 file_lock->fl_pid = current->tgid;
2460 flock.l_flock.pid = file_lock->fl_pid;
2461 flock.l_flock.start = file_lock->fl_start;
2462 flock.l_flock.end = file_lock->fl_end;
2464 switch (file_lock->fl_type) {
2466 einfo.ei_mode = LCK_PR;
2469 /* An unlock request may or may not have any relation to
2470 * existing locks so we may not be able to pass a lock handle
2471 * via a normal ldlm_lock_cancel() request. The request may even
2472 * unlock a byte range in the middle of an existing lock. In
2473 * order to process an unlock request we need all of the same
2474 * information that is given with a normal read or write record
2475 * lock request. To avoid creating another ldlm unlock (cancel)
2476 * message we'll treat a LCK_NL flock request as an unlock. */
2477 einfo.ei_mode = LCK_NL;
2480 einfo.ei_mode = LCK_PW;
2483 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2498 flags = LDLM_FL_BLOCK_NOWAIT;
2504 flags = LDLM_FL_TEST_LOCK;
2505 /* Save the old mode so that if the mode in the lock changes we
2506 * can decrement the appropriate reader or writer refcount. */
2507 file_lock->fl_type = einfo.ei_mode;
2510 CERROR("unknown fcntl lock command: %d\n", cmd);
2514 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2515 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2516 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2518 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2519 &flock, &flags, NULL, 0, NULL, &lockh, 0);
2520 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2521 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2522 #ifdef HAVE_F_OP_FLOCK
2523 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2524 !(flags & LDLM_FL_TEST_LOCK))
2525 posix_lock_file_wait(file, file_lock);
2531 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2538 int ll_have_md_lock(struct inode *inode, __u64 bits)
2540 struct lustre_handle lockh;
2541 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2549 fid = &ll_i2info(inode)->lli_fid;
2550 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2552 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2553 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2554 LCK_CR|LCK_CW|LCK_PR, &lockh)) {
2561 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2562 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2563 * and return success */
2565 /* This path cannot be hit for regular files unless in
2566 * case of obscure races, so no need to to validate
2568 if (!S_ISREG(inode->i_mode) &&
2569 !S_ISDIR(inode->i_mode))
2574 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2582 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2584 struct inode *inode = dentry->d_inode;
2585 struct ptlrpc_request *req = NULL;
2586 struct ll_sb_info *sbi;
2587 struct obd_export *exp;
2592 CERROR("REPORT THIS LINE TO PETER\n");
2595 sbi = ll_i2sbi(inode);
2597 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2598 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2600 exp = ll_i2mdexp(inode);
2602 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2603 struct lookup_intent oit = { .it_op = IT_GETATTR };
2604 struct md_op_data *op_data;
2606 /* Call getattr by fid, so do not provide name at all. */
2607 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2608 dentry->d_inode, NULL, 0, 0,
2609 LUSTRE_OPC_ANY, NULL);
2610 if (IS_ERR(op_data))
2611 RETURN(PTR_ERR(op_data));
2613 oit.it_flags |= O_CHECK_STALE;
2614 rc = md_intent_lock(exp, op_data, NULL, 0,
2615 /* we are not interested in name
2618 ll_md_blocking_ast, 0);
2619 ll_finish_md_op_data(op_data);
2620 oit.it_flags &= ~O_CHECK_STALE;
2622 rc = ll_inode_revalidate_fini(inode, rc);
2626 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2628 ll_intent_release(&oit);
2632 /* Unlinked? Unhash dentry, so it is not picked up later by
2633 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2634 here to preserve get_cwd functionality on 2.6.
2636 if (!dentry->d_inode->i_nlink) {
2637 spin_lock(&dcache_lock);
2638 ll_drop_dentry(dentry);
2639 spin_unlock(&dcache_lock);
2642 ll_lookup_finish_locks(&oit, dentry);
2643 } else if (!ll_have_md_lock(dentry->d_inode,
2644 MDS_INODELOCK_UPDATE)) {
2645 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2646 obd_valid valid = OBD_MD_FLGETATTR;
2647 struct obd_capa *oc;
2650 if (S_ISREG(inode->i_mode)) {
2651 rc = ll_get_max_mdsize(sbi, &ealen);
2654 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2656 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2657 * capa for this inode. Because we only keep capas of dirs
2659 oc = ll_mdscapa_get(inode);
2660 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2664 rc = ll_inode_revalidate_fini(inode, rc);
2668 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2674 /* if object not yet allocated, don't validate size */
2675 if (ll_i2info(inode)->lli_smd == NULL)
2678 /* ll_glimpse_size will prefer locally cached writes if they extend
2680 rc = ll_glimpse_size(inode, 0);
2683 ptlrpc_req_finished(req);
2687 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2688 struct lookup_intent *it, struct kstat *stat)
2690 struct inode *inode = de->d_inode;
2693 res = ll_inode_revalidate_it(de, it);
2694 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2699 stat->dev = inode->i_sb->s_dev;
2700 stat->ino = inode->i_ino;
2701 stat->mode = inode->i_mode;
2702 stat->nlink = inode->i_nlink;
2703 stat->uid = inode->i_uid;
2704 stat->gid = inode->i_gid;
2705 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2706 stat->atime = inode->i_atime;
2707 stat->mtime = inode->i_mtime;
2708 stat->ctime = inode->i_ctime;
2709 #ifdef HAVE_INODE_BLKSIZE
2710 stat->blksize = inode->i_blksize;
2712 stat->blksize = 1 << inode->i_blkbits;
2715 ll_inode_size_lock(inode, 0);
2716 stat->size = i_size_read(inode);
2717 stat->blocks = inode->i_blocks;
2718 ll_inode_size_unlock(inode, 0);
2722 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2724 struct lookup_intent it = { .it_op = IT_GETATTR };
2726 return ll_getattr_it(mnt, de, &it, stat);
2730 int lustre_check_acl(struct inode *inode, int mask)
2732 #ifdef CONFIG_FS_POSIX_ACL
2733 struct ll_inode_info *lli = ll_i2info(inode);
2734 struct posix_acl *acl;
2738 spin_lock(&lli->lli_lock);
2739 acl = posix_acl_dup(lli->lli_posix_acl);
2740 spin_unlock(&lli->lli_lock);
2745 rc = posix_acl_permission(inode, acl, mask);
2746 posix_acl_release(acl);
2754 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2755 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2757 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2758 inode->i_ino, inode->i_generation, inode, mask);
2759 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2760 return lustre_check_remote_perm(inode, mask);
2762 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2763 return generic_permission(inode, mask, lustre_check_acl);
2766 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2768 int mode = inode->i_mode;
2771 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2772 inode->i_ino, inode->i_generation, inode, mask);
2774 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2775 return lustre_check_remote_perm(inode, mask);
2777 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2779 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2780 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2782 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2784 if (current->fsuid == inode->i_uid) {
2787 if (((mode >> 3) & mask & S_IRWXO) != mask)
2789 rc = lustre_check_acl(inode, mask);
2793 goto check_capabilities;
2797 if (in_group_p(inode->i_gid))
2800 if ((mode & mask & S_IRWXO) == mask)
2804 if (!(mask & MAY_EXEC) ||
2805 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2806 if (capable(CAP_DAC_OVERRIDE))
2809 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2810 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2817 /* -o localflock - only provides locally consistent flock locks */
2818 struct file_operations ll_file_operations = {
2819 .read = ll_file_read,
2820 .write = ll_file_write,
2821 .ioctl = ll_file_ioctl,
2822 .open = ll_file_open,
2823 .release = ll_file_release,
2824 .mmap = ll_file_mmap,
2825 .llseek = ll_file_seek,
2826 .sendfile = ll_file_sendfile,
2830 struct file_operations ll_file_operations_flock = {
2831 .read = ll_file_read,
2832 .write = ll_file_write,
2833 .ioctl = ll_file_ioctl,
2834 .open = ll_file_open,
2835 .release = ll_file_release,
2836 .mmap = ll_file_mmap,
2837 .llseek = ll_file_seek,
2838 .sendfile = ll_file_sendfile,
2840 #ifdef HAVE_F_OP_FLOCK
2841 .flock = ll_file_flock,
2843 .lock = ll_file_flock
2846 /* These are for -o noflock - to return ENOSYS on flock calls */
2847 struct file_operations ll_file_operations_noflock = {
2848 .read = ll_file_read,
2849 .write = ll_file_write,
2850 .ioctl = ll_file_ioctl,
2851 .open = ll_file_open,
2852 .release = ll_file_release,
2853 .mmap = ll_file_mmap,
2854 .llseek = ll_file_seek,
2855 .sendfile = ll_file_sendfile,
2857 #ifdef HAVE_F_OP_FLOCK
2858 .flock = ll_file_noflock,
2860 .lock = ll_file_noflock
2863 struct inode_operations ll_file_inode_operations = {
2864 #ifdef LUSTRE_KERNEL_VERSION
2865 .setattr_raw = ll_setattr_raw,
2867 .setattr = ll_setattr,
2868 .truncate = ll_truncate,
2869 .getattr = ll_getattr,
2870 .permission = ll_inode_permission,
2871 .setxattr = ll_setxattr,
2872 .getxattr = ll_getxattr,
2873 .listxattr = ll_listxattr,
2874 .removexattr = ll_removexattr,
2877 /* dynamic ioctl number support routins */
2878 static struct llioc_ctl_data {
2879 struct rw_semaphore ioc_sem;
2880 struct list_head ioc_head;
2882 __RWSEM_INITIALIZER(llioc.ioc_sem),
2883 CFS_LIST_HEAD_INIT(llioc.ioc_head)
2888 struct list_head iocd_list;
2889 unsigned int iocd_size;
2890 llioc_callback_t iocd_cb;
2891 unsigned int iocd_count;
2892 unsigned int iocd_cmd[0];
2895 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2898 struct llioc_data *in_data = NULL;
2901 if (cb == NULL || cmd == NULL ||
2902 count > LLIOC_MAX_CMD || count < 0)
2905 size = sizeof(*in_data) + count * sizeof(unsigned int);
2906 OBD_ALLOC(in_data, size);
2907 if (in_data == NULL)
2910 memset(in_data, 0, sizeof(*in_data));
2911 in_data->iocd_size = size;
2912 in_data->iocd_cb = cb;
2913 in_data->iocd_count = count;
2914 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2916 down_write(&llioc.ioc_sem);
2917 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2918 up_write(&llioc.ioc_sem);
2923 void ll_iocontrol_unregister(void *magic)
2925 struct llioc_data *tmp;
2930 down_write(&llioc.ioc_sem);
2931 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2933 unsigned int size = tmp->iocd_size;
2935 list_del(&tmp->iocd_list);
2936 up_write(&llioc.ioc_sem);
2938 OBD_FREE(tmp, size);
2942 up_write(&llioc.ioc_sem);
2944 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2947 EXPORT_SYMBOL(ll_iocontrol_register);
2948 EXPORT_SYMBOL(ll_iocontrol_unregister);
2950 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2951 unsigned int cmd, unsigned long arg, int *rcp)
2953 enum llioc_iter ret = LLIOC_CONT;
2954 struct llioc_data *data;
2955 int rc = -EINVAL, i;
2957 down_read(&llioc.ioc_sem);
2958 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2959 for (i = 0; i < data->iocd_count; i++) {
2960 if (cmd != data->iocd_cmd[i])
2963 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2967 if (ret == LLIOC_STOP)
2970 up_read(&llioc.ioc_sem);