1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Directory code for lustre client.
42 #include <linux/pagemap.h>
44 #include <linux/version.h>
45 #include <linux/smp_lock.h>
46 #include <asm/uaccess.h>
47 #include <linux/buffer_head.h> // for wait_on_buffer
49 #define DEBUG_SUBSYSTEM S_LLITE
51 #include <obd_support.h>
52 #include <obd_class.h>
53 #include <lustre_lib.h>
54 #include <lustre/lustre_idl.h>
55 #include <lustre_lite.h>
56 #include <lustre_dlm.h>
57 #include "llite_internal.h"
/*
 * Compatibility shims: on kernels that lack PageChecked()/SetPageChecked(),
 * emulate them with the PG_fs_misc page flag when available.
 * NOTE(review): this listing is sampled -- the #else/#endif lines around
 * the #error are not visible here.
 */
59 #ifndef HAVE_PAGE_CHECKED
60 #ifdef HAVE_PG_FS_MISC
61 #define PageChecked(page) test_bit(PG_fs_misc, &(page)->flags)
62 #define SetPageChecked(page) set_bit(PG_fs_misc, &(page)->flags)
64 #error PageChecked or PageFsMisc not defined in kernel
68 /* returns the page unlocked, but with a reference */
/*
 * ->readpage() for pre-FID (1.8-style) directories: fetch one directory
 * page from the MDS via mdc_readpage() and refresh the cached inode size
 * from the reply body.
 * NOTE(review): sampled listing -- braces, rc declaration and the error
 * path after mdc_readpage() are not visible here.
 */
69 static int ll_dir_readpage(struct file *file, struct page *page)
71 struct inode *inode = page->mapping->host;
72 struct ll_fid mdc_fid;
74 struct ptlrpc_request *request;
75 struct mds_body *body;
/* byte offset of this page within the directory */
79 offset = (__u64)page->index << CFS_PAGE_SHIFT;
80 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n",
81 inode->i_ino, inode->i_generation, inode, offset);
/* build an ll_fid for the directory; S_IFDIR marks the object type */
83 ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
85 rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid,
86 offset, page, &request);
88 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
90 LASSERT(body != NULL); /* checked by mdc_readpage() */
91 /* swabbed by mdc_readpage() */
92 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
/* MDS reply carries an authoritative size; update our cache under lock */
94 if (body->size != i_size_read(inode)) {
95 ll_inode_size_lock(inode, 0);
96 i_size_write(inode, body->size);
97 ll_inode_size_unlock(inode, 0);
100 SetPageUptodate(page);
102 ptlrpc_req_finished(request);
/* Address-space ops for 1.8-style directory inodes; only readpage is set
 * in the visible portion of this listing. */
109 struct address_space_operations ll_dir_aops = {
110 .readpage = ll_dir_readpage,
/* Mask that clears the within-filesystem-block offset bits, i.e. rounds a
 * directory offset down to its block boundary (s_blocksize is a power of 2). */
113 static inline unsigned ll_dir_page_mask(struct inode *inode)
115 return ~(inode->i_sb->s_blocksize - 1);
119 * Check consistency of a single entry.
/*
 * Validate one on-page directory entry: minimum length, 4-byte alignment,
 * rec_len large enough for the name, and no block-boundary crossing.
 * On failure, sets "msg" and logs a CERROR with full context.
 * NOTE(review): sampled listing -- the declaration of "msg", the success
 * return and the final error return are not visible here.
 */
121 static int ll_dir_check_entry(struct inode *dir, struct ll_dir_entry *ent,
122 unsigned offset, unsigned rec_len, pgoff_t index)
127 * Consider adding more checks.
130 if (unlikely(rec_len < ll_dir_rec_len(1)))
131 msg = "entry is too short";
132 else if (unlikely(rec_len & 3))
133 msg = "wrong alignment";
134 else if (unlikely(rec_len < ll_dir_rec_len(ent->lde_name_len)))
135 msg = "rec_len doesn't match name_len";
/* XOR of first/last byte offsets has block bits set iff the entry
 * straddles a filesystem block boundary */
136 else if (unlikely(((offset + rec_len - 1) ^ offset) &
137 ll_dir_page_mask(dir)))
138 msg = "directory entry across blocks";
141 CERROR("%s: bad entry in directory %lu/%u: %s - "
142 "offset=%lu+%u, inode=%lu, rec_len=%d,"
143 " name_len=%d\n", ll_i2mdcexp(dir)->exp_obd->obd_name,
144 dir->i_ino, dir->i_generation, msg,
145 index << CFS_PAGE_SHIFT,
146 offset, (unsigned long)le32_to_cpu(ent->lde_inode),
147 rec_len, ent->lde_name_len);
/*
 * Verify the format of an entire cached directory page: walk every entry
 * with ll_dir_check_entry(), pad the tail of the last (partial) page with
 * dummy forwarding entries, and mark the page Checked so this is done once.
 * NOTE(review): sampled listing -- declarations of off/limit/err/reclen,
 * several braces and the loop header's first clause are not visible.
 */
151 static void ll_dir_check_page(struct inode *dir, struct page *page)
154 unsigned size = dir->i_sb->s_blocksize;
155 char *addr = page_address(page);
160 struct ll_dir_entry *ent;
/* is this the last (possibly partial) page of the directory? */
163 if ((i_size_read(dir) >> CFS_PAGE_SHIFT) == (__u64)page->index) {
167 limit = i_size_read(dir) & ~CFS_PAGE_MASK;
/* directory size should be a whole number of blocks */
168 if (limit & (size - 1)) {
169 CERROR("%s: dir %lu/%u size %llu doesn't match %u\n",
170 ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
171 dir->i_generation, i_size_read(dir), size);
175 * Place dummy forwarding entries to streamline
/* fill the unused tail with empty full-block entries so the walk
 * below never runs off into uninitialized bytes */
178 for (off = limit; off < CFS_PAGE_SIZE; off += size) {
179 ent = ll_entry_at(addr, off);
180 ent->lde_rec_len = cpu_to_le16(size);
181 ent->lde_name_len = 0;
186 limit = CFS_PAGE_SIZE;
/* walk and validate every entry up to the limit */
189 !err && off <= limit - ll_dir_rec_len(1); off += reclen) {
190 ent = ll_entry_at(addr, off);
191 reclen = le16_to_cpu(ent->lde_rec_len);
192 err = ll_dir_check_entry(dir, ent, off, reclen, page->index);
/* a clean walk must land exactly on the limit */
195 if (!err && off != limit) {
196 ent = ll_entry_at(addr, off);
197 CERROR("%s: entry in directory %lu/%u spans the page boundary "
198 "offset="LPU64"+%u, inode=%lu\n",
199 ll_i2mdcexp(dir)->exp_obd->obd_name,
200 dir->i_ino, dir->i_generation,
201 (__u64)page->index << CFS_PAGE_SHIFT,
202 off, (unsigned long)le32_to_cpu(ent->lde_inode));
207 SetPageChecked(page);
/*
 * Return (with a reference) the directory page at index n for a 1.8-style
 * directory, reading it under a CR UPDATE DLM lock on the directory:
 * match an existing lock or enqueue a new one, read the page through the
 * page cache, then validate it with ll_dir_check_page().
 * Returns the page or ERR_PTR() on failure.
 * NOTE(review): sampled listing -- rc/page declarations, several braces,
 * the fail/out_unlock labels and kmap/return lines are not visible.
 */
210 struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
212 struct ldlm_res_id res_id;
213 struct lustre_handle lockh;
214 struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
215 struct address_space *mapping = dir->i_mapping;
217 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
220 fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
/* try to reuse a compatible granted lock before enqueueing a new one */
221 rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
222 &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
224 struct lookup_intent it = { .it_op = IT_READDIR };
225 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
226 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
227 struct ptlrpc_request *request;
228 struct mdc_op_data data = { { 0 } };
230 ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL);
232 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
233 &data, &lockh, NULL, 0, 0);
/* the enqueue reply request is no longer needed once the lock is held */
235 request = (struct ptlrpc_request *)it.d.lustre.it_data;
237 ptlrpc_req_finished(request);
239 CERROR("lock enqueue: rc: %d\n", rc);
243 ldlm_lock_dump_handle(D_OTHER, &lockh);
/* read via the page cache; ll_dir_readpage() does the actual RPC */
245 page = read_cache_page(mapping, n,
246 (filler_t*)mapping->a_ops->readpage, NULL);
248 GOTO(out_unlock, page);
252 if (!PageUptodate(page))
/* validate page format exactly once per cached page */
254 if (!PageChecked(page))
255 ll_dir_check_page(dir, page);
260 ldlm_lock_decref(&lockh, LCK_CR);
265 page = ERR_PTR(-EIO);
/*
 * Re-align a possibly mid-entry offset (e.g. after lseek) to an actual
 * entry boundary: restart from the beginning of the enclosing chunk
 * (offset & mask) and walk forward until reaching or passing the target.
 * Returns the offset of the first valid entry at or before "offset".
 */
269 static inline unsigned ll_dir_validate_entry(char *base, unsigned offset,
272 struct ll_dir_entry *de = ll_entry_at(base, offset);
273 struct ll_dir_entry *p = ll_entry_at(base, offset & mask);
274 while (p < de && p->lde_rec_len > 0)
275 p = ll_dir_next_entry(p);
276 return (char *)p - base;
280 * File type constants. The same as in ext2 for compatibility.
/* Map on-disk LL_DIR_FT_* file-type codes (ext2-compatible) to the DT_*
 * values expected by filldir/readdir. */
295 static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = {
296 [LL_DIR_FT_UNKNOWN] = DT_UNKNOWN,
297 [LL_DIR_FT_REG_FILE] = DT_REG,
298 [LL_DIR_FT_DIR] = DT_DIR,
299 [LL_DIR_FT_CHRDEV] = DT_CHR,
300 [LL_DIR_FT_BLKDEV] = DT_BLK,
301 [LL_DIR_FT_FIFO] = DT_FIFO,
302 [LL_DIR_FT_SOCK] = DT_SOCK,
303 [LL_DIR_FT_SYMLINK] = DT_LNK,
307 * Process one page. Returns:
309 * -ve: filldir commands readdir to stop.
310 * +ve: number of entries submitted to filldir.
311 * 0: no live entries on this page.
/*
 * Feed the live entries of one mapped directory page to filldir, starting
 * at *offset; *offset is updated to the current entry as we go so the
 * caller can resume.  "base" is the file-position of the page start, OR'd
 * with the in-page offset to form the filldir position cookie.
 * NOTE(review): sampled listing -- end/nr declarations, the nr++ counting
 * and the returns are not visible here.
 */
314 static int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
315 filldir_t filldir, void *cookie)
317 struct ll_dir_entry *de;
321 de = ll_entry_at(addr, *offset);
/* last position where a minimal entry could still start */
322 end = addr + CFS_PAGE_SIZE - ll_dir_rec_len(1);
323 for (nr = 0 ;(char*)de <= end; de = ll_dir_next_entry(de)) {
/* lde_inode == 0 marks a deleted/dummy entry; skip those */
324 if (de->lde_inode != 0) {
326 *offset = (char *)de - addr;
327 if (filldir(cookie, de->lde_name, de->lde_name_len,
328 base | *offset, le32_to_cpu(de->lde_inode),
/* mask guards against out-of-range on-disk type codes */
329 ll_dir_filetype_table[de->lde_file_type &
330 (LL_DIR_FT_MAX - 1)]))
/*
 * readdir implementation for pre-FID (1.8 protocol) directories: iterate
 * the directory page by page via ll_get_dir_page(), submitting entries
 * through ll_readdir_page() until filldir asks to stop or pages run out.
 * Handles lseek-moved f_pos via ll_dir_validate_entry() re-alignment.
 * NOTE(review): sampled listing -- page/kaddr/rc declarations, RETURN
 * paths, page_cache_release calls and several braces are not visible.
 */
337 static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir)
339 struct inode *inode = filp->f_dentry->d_inode;
340 loff_t pos = filp->f_pos;
341 unsigned offset = pos & ~CFS_PAGE_MASK;
342 pgoff_t idx = pos >> CFS_PAGE_SHIFT;
343 pgoff_t npages = dir_pages(inode);
344 unsigned chunk_mask = ll_dir_page_mask(inode);
/* i_version changes when the dir changes; stale f_version forces rescan */
345 int need_revalidate = (filp->f_version != inode->i_version);
347 int done; /* when this becomes negative --- stop iterating */
351 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %llu/%llu\n",
352 inode->i_ino, inode->i_generation, inode,
353 pos, i_size_read(inode));
356 * Checking ->i_size without the lock. Should be harmless, as server
/* nothing left to read past EOF minus one minimal entry */
359 if (pos > i_size_read(inode) - ll_dir_rec_len(1))
362 for (done = 0; idx < npages; idx++, offset = 0) {
364 * We can assume that all blocks on this page are filled with
365 * entries, because ll_dir_check_page() placed special dummy
372 CDEBUG(D_EXT2,"read %lu of dir %lu/%u page %lu/%lu "
374 CFS_PAGE_SIZE, inode->i_ino, inode->i_generation,
375 idx, npages, i_size_read(inode));
376 page = ll_get_dir_page(inode, idx);
378 /* size might have been updated by mdc_readpage */
379 npages = dir_pages(inode);
383 CERROR("error reading dir %lu/%u page %lu: rc %d\n",
384 inode->i_ino, inode->i_generation, idx, rc);
388 kaddr = page_address(page);
389 if (need_revalidate) {
391 * File offset was changed by lseek() and possibly
392 * points in the middle of an entry. Re-scan from the
393 * beginning of the chunk.
395 offset = ll_dir_validate_entry(kaddr, offset,
399 done = ll_readdir_page(kaddr, idx << CFS_PAGE_SHIFT,
400 &offset, filldir, dirent);
404 * Some entries were sent to the user space, return
410 * filldir is satisfied.
/* persist resume position and version for the next readdir call */
415 filp->f_pos = (idx << CFS_PAGE_SHIFT) | offset;
416 filp->f_version = inode->i_version;
417 touch_atime(filp->f_vfsmnt, filp->f_dentry);
423 * Chain of hash overflow pages.
/* Placeholder for a chain of hash-overflow pages; currently empty, so
 * init/fini are no-ops. */
425 struct ll_dir_chain {
426 /* XXX something. Later */
429 static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
433 static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
/* Bit-complement of a hash, used as a page-cache index so radix-tree gang
 * lookup finds the page with the largest starting hash <= target. */
437 static inline __u32 hash_x_index(__u32 value)
439 return ((__u32)~0) - value;
443 * Layout of readdir pages, as transmitted on wire.
/* Wire-format fragments for FID-based (2.0 protocol) readdir pages.
 * NOTE(review): sampled listing -- most struct lu_dirent and lu_dirpage
 * members, and the enum's values, are not visible here. */
446 struct lu_fid lde_fid;
455 __u64 ldp_hash_start;
/* trailing variable-length array of entries */
460 struct lu_dirent ldp_entries[0];
463 enum lu_dirpage_flags {
/* First entry of a dirpage, or NULL if the page is flagged empty. */
467 static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
469 if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY)
472 return dp->ldp_entries;
/* Next entry in a dirpage; lde_reclen == 0 terminates the list
 * (the "next = NULL" branch is not visible in this listing). */
475 static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
477 struct lu_dirent *next;
479 if (le16_to_cpu(ent->lde_reclen) != 0)
480 next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
/* Size of one entry: explicit lde_reclen if set, otherwise computed from
 * the name length, rounded up to 4 bytes. */
487 static inline int lu_dirent_size(struct lu_dirent *ent)
489 if (le16_to_cpu(ent->lde_reclen) == 0) {
490 return (sizeof(*ent) +
491 le16_to_cpu(ent->lde_namelen) + 3) & ~3;
493 return le16_to_cpu(ent->lde_reclen);
/* Sentinel hash meaning "end of directory" in 2.0 readdir. */
496 #define DIR_END_OFF 0xfffffffffffffffeULL
/* Page-cache tree_lock changed from rwlock to spinlock across kernel
 * versions; pick the matching lock primitives. */
498 #ifdef HAVE_RW_TREE_LOCK
499 #define TREE_READ_LOCK_IRQ(mapping) read_lock_irq(&(mapping)->tree_lock)
500 #define TREE_READ_UNLOCK_IRQ(mapping) read_unlock_irq(&(mapping)->tree_lock)
502 #define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock)
503 #define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock)
506 /* returns the page unlocked, but with a reference */
/*
 * ->readpage() for FID-based (2.0 protocol) directories: the page index
 * encodes a directory hash (via hash_x_index), which is sent to the MDS
 * with mdc_readpage(); inode size is refreshed only when the reply body
 * carries OBD_MD_FLSIZE.
 * NOTE(review): sampled listing -- fid/hash/rc declarations, braces and
 * the error path are not visible here.
 */
507 static int ll_dir_readpage_20(struct file *file, struct page *page)
509 struct inode *inode = page->mapping->host;
510 struct ptlrpc_request *request;
511 struct mdt_body *body;
/* recover the directory hash encoded in the page-cache index */
517 hash = hash_x_index(page->index);
518 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
519 inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
521 ll_inode2fid(&fid, inode);
522 rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid,
523 hash, page, &request);
525 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
527 /* Checked by mdc_readpage() */
528 LASSERT(body != NULL);
/* only trust the reply size when the server marked it valid */
530 if (body->valid & OBD_MD_FLSIZE) {
531 ll_inode_size_lock(inode, 0);
532 i_size_write(inode, body->size);
533 ll_inode_size_unlock(inode, 0);
535 SetPageUptodate(page);
537 ptlrpc_req_finished(request);
/* 2.0-protocol counterpart of ll_dir_check_page(): format validation is
 * not implemented yet; just mark the page as checked. */
545 static void ll_check_page(struct inode *dir, struct page *page)
547 /* XXX: check page format later */
548 SetPageChecked(page);
553 * Find, kmap and return page that contains given hash.
/*
 * Locate an already-cached dirpage covering "hash" using a radix-tree gang
 * lookup on the complemented index; on success fill *start/*end with the
 * page's hash range.  A cached page whose range excludes the hash is
 * truncated from cache.  Returns the page, NULL, or ERR_PTR(-EIO).
 * NOTE(review): sampled listing -- page/found declarations, kmap calls,
 * braces and some unlock/return lines are not visible here.
 */
555 static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
556 __u64 *start, __u64 *end)
558 struct address_space *mapping = dir->i_mapping;
560 * Complement of hash is used as an index so that
561 * radix_tree_gang_lookup() can be used to find a page with starting
562 * hash _smaller_ than one we are looking for.
564 unsigned long offset = hash_x_index(hash);
569 TREE_READ_LOCK_IRQ(mapping);
570 found = radix_tree_gang_lookup(&mapping->page_tree,
571 (void **)&page, offset, 1);
573 struct lu_dirpage *dp;
/* pin the page before dropping the tree lock */
575 page_cache_get(page);
576 TREE_READ_UNLOCK_IRQ(mapping);
578 * In contrast to find_lock_page() we are sure that directory
579 * page cannot be truncated (while DLM lock is held) and,
580 * hence, can avoid restart.
582 * In fact, page cannot be locked here at all, because
583 * ll_dir_readpage() does synchronous io.
586 if (PageUptodate(page)) {
588 *start = le64_to_cpu(dp->ldp_hash_start);
589 *end = le64_to_cpu(dp->ldp_hash_end);
590 LASSERT(*start <= hash);
/* hash outside this page's range: the cached page is stale */
591 if (hash > *end || (*end != *start && hash == *end)) {
594 ll_truncate_complete_page(page);
596 page_cache_release(page);
/* page found but not up to date: treat as I/O error */
600 page_cache_release(page);
601 page = ERR_PTR(-EIO);
605 TREE_READ_UNLOCK_IRQ(mapping);
/*
 * FID-era counterpart of ll_get_dir_page(): under a CR UPDATE DLM lock,
 * first try ll_dir_page_locate() on the cache, falling back to
 * read_cache_page() with ll_dir_readpage_20 as filler.  "exact" demands a
 * page starting precisely at "hash"; a stale cached page is truncated and
 * re-fetched.  Handles page-wide hash collisions via the hash_collision
 * path.  Returns the page or ERR_PTR().
 * NOTE(review): sampled listing -- mode/rc/page/start/end declarations,
 * several braces, labels and GOTO/return lines are not visible here.
 */
611 static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact,
612 struct ll_dir_chain *chain)
614 struct ldlm_res_id res_id;
615 struct lustre_handle lockh;
616 struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
617 struct address_space *mapping = dir->i_mapping;
618 struct lu_dirpage *dp;
620 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
627 fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
/* reuse a compatible granted lock if one exists */
629 rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
630 &res_id, LDLM_IBITS, &policy, mode, &lockh);
632 struct lookup_intent it = { .it_op = IT_READDIR };
633 struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode,
634 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
635 struct ptlrpc_request *request;
636 struct mdc_op_data op_data = { { 0 } };
638 ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL);
640 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
641 &op_data, &lockh, NULL, 0, 0);
643 request = (struct ptlrpc_request *)it.d.lustre.it_data;
645 ptlrpc_req_finished(request);
647 CERROR("lock enqueue: rc: %d\n", rc);
651 ldlm_lock_dump_handle(D_OTHER, &lockh);
/* fast path: the page may already be cached */
653 page = ll_dir_page_locate(dir, hash, &start, &end);
655 GOTO(out_unlock, page);
659 * XXX nikita: not entirely correct handling of a corner case:
660 * suppose hash chain of entries with hash value HASH crosses
661 * border between pages P0 and P1. First both P0 and P1 are
662 * cached, seekdir() is called for some entry from the P0 part
663 * of the chain. Later P0 goes out of cache. telldir(HASH)
664 * happens and finds P1, as it starts with matching hash
665 * value. Remaining entries from P0 part of the chain are
666 * skipped. (Is that really a bug?)
668 * Possible solutions: 0. don't cache P1 is such case, handle
669 * it as an "overflow" page. 1. invalidate all pages at
670 * once. 2. use HASH|1 as an index for P1.
672 if (exact && hash != start) {
674 * readdir asked for a page starting _exactly_ from
675 * given hash, but cache contains stale page, with
676 * entries with smaller hash values. Stale page should
677 * be invalidated, and new one fetched.
679 CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n",
680 page, (unsigned long)hash, (unsigned long)start);
682 ll_truncate_complete_page(page);
684 page_cache_release(page);
686 GOTO(hash_collision, page);
/* slow path: synchronously read the page from the MDS */
690 page = read_cache_page(mapping, hash_x_index(hash),
691 (filler_t*)ll_dir_readpage_20, NULL);
693 GOTO(out_unlock, page);
697 if (!PageUptodate(page))
699 if (!PageChecked(page))
700 ll_check_page(dir, page);
704 dp = page_address(page);
706 start = le64_to_cpu(dp->ldp_hash_start);
707 end = le64_to_cpu(dp->ldp_hash_end);
/* whole page shares one hash value: an overflow chain */
709 LASSERT(start == hash);
710 CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
712 * Fetch whole overflow chain...
719 ldlm_lock_decref(&lockh, mode);
724 page = ERR_PTR(-EIO);
/*
 * readdir implementation for FID-based (2.0 protocol) directories:
 * hash-ordered iteration.  Starting from f_pos (a hash), fetch dirpages
 * with ll_get_dir_page_20() and submit entries via filldir, following
 * ldp_hash_end to the next page until DIR_END_OFF or filldir stops us.
 * NOTE(review): sampled listing -- rc/done/page declarations, the
 * hash-skip logic, page release calls and several braces are not visible.
 */
728 static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir)
730 struct inode *inode = filp->f_dentry->d_inode;
731 struct ll_sb_info *sbi = ll_i2sbi(inode);
732 __u64 pos = filp->f_pos;
734 struct ll_dir_chain chain;
740 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu\n",
741 inode->i_ino, inode->i_generation, inode,
742 (unsigned long)pos, i_size_read(inode));
/* a previous call already reached end-of-directory */
744 if (pos == DIR_END_OFF)
753 ll_dir_chain_init(&chain);
755 page = ll_get_dir_page_20(inode, pos, 0, &chain);
757 while (rc == 0 && !done) {
758 struct lu_dirpage *dp;
759 struct lu_dirent *ent;
763 * If page is empty (end of directoryis reached),
766 __u64 hash = DIR_END_OFF;
769 dp = page_address(page);
770 for (ent = lu_dirent_start(dp); ent != NULL && !done;
771 ent = lu_dirent_next(ent)) {
777 hash = le64_to_cpu(ent->lde_hash);
778 namelen = le16_to_cpu(ent->lde_namelen);
782 * Skip until we find target hash
/* map the wire FID to a local inode number for filldir */
794 name = ent->lde_name;
795 fid_le_to_cpu(&fid, &fid);
796 ino = ll_fid_build_ino(sbi, (struct ll_fid*)&fid);
798 done = filldir(cookie, name, namelen,
799 (loff_t)hash, ino, DT_UNKNOWN);
/* advance to the hash where the next page begins */
801 next = le64_to_cpu(dp->ldp_hash_end);
805 if (pos == DIR_END_OFF)
807 * End of directory reached.
810 else if (1 /* chain is exhausted*/)
812 * Normal case: continue to the next
815 page = ll_get_dir_page_20(inode, pos, 1,
819 * go into overflow page.
827 CERROR("error reading dir "DFID" at %lu: rc %d\n",
828 PFID(ll_inode_lu_fid(inode)),
829 (unsigned long)pos, rc);
/* NOTE(review): cast through __s32 truncates the 64-bit hash; presumably
 * deliberate for 32-bit f_pos compatibility -- confirm upstream */
833 filp->f_pos = (loff_t)(__s32)pos;
834 filp->f_version = inode->i_version;
835 touch_atime(filp->f_vfsmnt, filp->f_dentry);
837 ll_dir_chain_fini(&chain);
/*
 * Dispatch readdir to the protocol-appropriate implementation: servers
 * advertising OBD_CONNECT_FID use the 2.0 hash-based path, otherwise the
 * legacy 1.8 offset-based path.
 */
842 static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
844 struct inode *inode = filp->f_dentry->d_inode;
845 struct ll_sb_info *sbi = ll_i2sbi(inode);
847 if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) {
848 return ll_readdir_20(filp, cookie, filldir);
850 return ll_readdir_18(filp, cookie, filldir);
/* Copy the user-visible fields of one quota-control struct into another
 * (both directions are used below when translating if_quotactl <->
 * obd_quotactl).  NOTE(review): the do/while wrapper lines are not
 * visible in this sampled listing. */
854 #define QCTL_COPY(out, in) \
856 Q_COPY(out, in, qc_cmd); \
857 Q_COPY(out, in, qc_type); \
858 Q_COPY(out, in, qc_id); \
859 Q_COPY(out, in, qc_stat); \
860 Q_COPY(out, in, qc_dqinfo); \
861 Q_COPY(out, in, qc_dqblk); \
/*
 * Send a "key=value" configuration string to the MGS via set_info on the
 * MGC export.  Returns 0 or a negative errno.
 * NOTE(review): sampled listing -- msp allocation/free and the return are
 * not visible; strncpy here relies on the caller bounding the string.
 */
864 static int ll_send_mgc_param(struct obd_export *mgc, char *string)
866 struct mgs_send_param *msp;
873 strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
874 rc = obd_set_info_async(mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
875 sizeof(struct mgs_send_param), msp, NULL);
877 CERROR("Failed to set parameter: %d\n", rc);
/*
 * Derive the filesystem name from the mount profile by stripping a
 * trailing "-client" suffix.  Returns an OBD_ALLOC'd buffer of
 * MGS_PARAM_MAXLEN bytes that the caller must OBD_FREE.
 * NOTE(review): sampled listing -- the len adjustment for the suffix and
 * the NUL-termination/return lines are not visible here.
 */
883 static char *ll_get_fsname(struct inode *inode)
885 struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
889 OBD_ALLOC(fsname, MGS_PARAM_MAXLEN);
890 len = strlen(lsi->lsi_lmd->lmd_profile);
891 ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
892 if (ptr && (strcmp(ptr, "-client") == 0))
894 strncpy(fsname, lsi->lsi_lmd->lmd_profile, len);
/*
 * Set the LOV striping EA on a directory: swab the user-supplied
 * lov_user_md (v1 or v3) to little-endian, push it to the MDS with
 * mdc_setattr(), and -- when set_default is true (fs root) -- also update
 * the filesystem-wide defaults through MGS parameters.
 * Returns 0 or a negative errno.
 * NOTE(review): sampled listing -- lum_size/rc declarations, RETURN/GOTO
 * lines, the "end" labels and several braces are not visible here.
 */
900 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
903 struct ll_sb_info *sbi = ll_i2sbi(inode);
904 struct mdc_op_data data = { { 0 } };
905 struct ptlrpc_request *req = NULL;
906 struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
907 struct obd_device *mgc = lsi->lsi_mgc;
908 char *fsname = NULL, *param = NULL;
911 struct iattr attr = { 0 };
915 * This is coming from userspace, so should be in
916 * local endian. But the MDS would like it in little
917 * endian, so we swab it before we send it.
919 switch (lump->lmm_magic) {
920 case LOV_USER_MAGIC_V1: {
/* if magic already matches LE, the struct is already little-endian */
921 if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
922 lustre_swab_lov_user_md_v1(lump);
923 lum_size = sizeof(struct lov_user_md_v1);
926 case LOV_USER_MAGIC_V3: {
927 if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
928 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lump);
929 lum_size = sizeof(struct lov_user_md_v3);
933 CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
934 " %#08x != %#08x nor %#08x\n",
935 lump->lmm_magic, LOV_USER_MAGIC_V1,
941 ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0, NULL);
943 /* swabbing is done in lov_setstripe() on server side */
944 rc = mdc_setattr(sbi->ll_mdc_exp, &data,
945 &attr, lump, lum_size, NULL, 0, &req);
947 ptlrpc_req_finished(req);
/* EPERM/EACCES are expected for unprivileged callers; don't log those */
948 if (rc != -EPERM && rc != -EACCES)
949 CERROR("mdc_setattr fails: rc = %d\n", rc);
952 ptlrpc_req_finished(req);
954 /* In the following we use the fact that LOV_USER_MAGIC_V1 and
955 LOV_USER_MAGIC_V3 have the same initial fields so we do not
956 need the make the distiction between the 2 versions */
957 if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
958 OBD_ALLOC(param, MGS_PARAM_MAXLEN);
960 /* Get fsname and assume devname to be -MDT0000. */
961 fsname = ll_get_fsname(inode);
962 /* Set root stripesize */
963 sprintf(param, "%s-MDT0000.lov.stripesize=%u", fsname,
964 le32_to_cpu(lump->lmm_stripe_size));
965 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
969 /* Set root stripecount */
970 sprintf(param, "%s-MDT0000.lov.stripecount=%u", fsname,
971 le16_to_cpu(lump->lmm_stripe_count));
972 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
976 /* Set root stripeoffset */
977 sprintf(param, "%s-MDT0000.lov.stripeoffset=%u", fsname,
978 le16_to_cpu(lump->lmm_stripe_offset));
979 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
984 OBD_FREE(fsname, MGS_PARAM_MAXLEN);
986 OBD_FREE(param, MGS_PARAM_MAXLEN);
/*
 * Fetch the LOV striping EA of a directory from the MDS via mdc_getattr()
 * and swab it to host endianness for userspace.  On success *lmmp points
 * into the reply buffer (caller must keep/finish *request), *lmm_size is
 * its length.  Returns 0, -ENODATA when no striping EA exists, or a
 * negative errno.
 * NOTE(review): sampled listing -- fid/rc/lmmsize declarations, GOTO
 * targets, case labels of the switch, and the final returns are not
 * visible here.
 */
991 int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
992 int *lmm_size, struct ptlrpc_request **request)
994 struct ll_sb_info *sbi = ll_i2sbi(inode);
996 struct mds_body *body;
997 struct lov_mds_md *lmm = NULL;
998 struct ptlrpc_request *req = NULL;
1001 ll_inode2fid(&fid, inode);
/* size the reply buffer for the largest possible EA */
1003 rc = ll_get_max_mdsize(sbi, &lmmsize);
1007 rc = mdc_getattr(sbi->ll_mdc_exp, &fid,
1008 OBD_MD_FLEASIZE|OBD_MD_FLDIREA,
1011 CDEBUG(D_INFO, "mdc_getattr failed on inode "
1012 "%lu/%u: rc %d\n", inode->i_ino,
1013 inode->i_generation, rc);
1016 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1018 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1019 /* swabbed by mdc_getattr_name */
1020 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1022 lmmsize = body->eadatasize;
/* no striping EA on this directory */
1024 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1026 GOTO(out, rc = -ENODATA);
1029 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1030 LASSERT(lmm != NULL);
1031 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1034 * This is coming from the MDS, so is probably in
1035 * little endian. We convert it to host endian before
1036 * passing it to userspace.
1038 /* We don't swab objects for directories */
1039 switch (le32_to_cpu(lmm->lmm_magic)) {
/* on big-endian hosts the LE wire format needs swabbing */
1041 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
1042 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1045 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
1046 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1049 CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
1055 *lmm_size = lmmsize;
/*
 * ioctl handler for directories.  Dispatches on cmd:
 *   - EXT3 flag/version ioctls (forwarded or answered locally)
 *   - IOC_MDC_LOOKUP: name lookup via mdc_getattr_name()
 *   - LL_IOC_LOV_SETSTRIPE / *GETSTRIPE / GETINFO: striping EA get/set
 *   - OBD_IOC_LLOG_CATINFO: llog catalog info RPC
 *   - quota ioctls (QUOTACHECK / POLL_QUOTACHECK / QUOTACTL)
 *   - OBD_IOC_GETNAME*: report the OSC obd name
 * NOTE(review): this sampled listing omits many braces, RETURN/GOTO
 * lines, local declarations and some case labels; comments below are
 * limited to what the visible lines establish.
 */
1060 static int ll_dir_ioctl(struct inode *inode, struct file *file,
1061 unsigned int cmd, unsigned long arg)
1063 struct ll_sb_info *sbi = ll_i2sbi(inode);
1064 struct obd_ioctl_data *data;
1067 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
1068 inode->i_ino, inode->i_generation, inode, cmd);
1070 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1071 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1074 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1076 case EXT3_IOC_GETFLAGS:
1077 case EXT3_IOC_SETFLAGS:
1078 RETURN(ll_iocontrol(inode, file, cmd, arg));
1079 case EXT3_IOC_GETVERSION_OLD:
1080 case EXT3_IOC_GETVERSION:
1081 RETURN(put_user(inode->i_generation, (int *)arg));
1082 /* We need to special case any other ioctls we want to handle,
1083 * to send them to the MDS/OST as appropriate and to properly
1084 * network encode the arg field.
1085 case EXT3_IOC_SETVERSION_OLD:
1086 case EXT3_IOC_SETVERSION:
1088 case IOC_MDC_LOOKUP: {
1089 struct ptlrpc_request *request = NULL;
1093 int namelen, rc, len = 0;
/* copy the ioctl payload (filename) in from userspace */
1095 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1100 filename = data->ioc_inlbuf1;
1101 namelen = data->ioc_inllen1;
1104 CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1105 GOTO(out, rc = -EINVAL);
1108 ll_inode2fid(&fid, inode);
1109 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename, namelen,
1110 OBD_MD_FLID, 0, &request);
1112 CDEBUG(D_INFO, "mdc_getattr_name: %d\n", rc);
1116 ptlrpc_req_finished(request);
1120 obd_ioctl_freedata(buf, len);
1123 case LL_IOC_LOV_SETSTRIPE: {
1124 struct lov_user_md_v3 lumv3;
1125 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1126 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1127 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1130 int set_default = 0;
1132 LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1133 LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1134 sizeof(lumv3p->lmm_objects[0]));
1136 /* first try with v1 which is smaller than v3 */
1137 rc = copy_from_user(lumv1, lumv1p, sizeof(*lumv1));
/* v3 magic means we need to re-copy the larger struct */
1141 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1142 rc = copy_from_user(&lumv3, lumv3p, sizeof(lumv3));
/* setting stripe on the fs root also updates the default */
1147 if (inode->i_sb->s_root == file->f_dentry)
1150 /* in v1 and v3 cases lumv1 points to data */
1151 rc = ll_dir_setstripe(inode, lumv1, set_default);
1155 case LL_IOC_OBD_STATFS:
1156 RETURN(ll_obd_statfs(inode, (void *)arg));
1157 case LL_IOC_LOV_GETSTRIPE:
1158 case LL_IOC_MDC_GETINFO:
1159 case IOC_MDC_GETFILEINFO:
1160 case IOC_MDC_GETFILESTRIPE: {
1161 struct ptlrpc_request *request = NULL;
1162 struct mds_body *body;
1163 struct lov_user_md *lump;
1164 struct lov_mds_md *lmm = NULL;
1165 char *filename = NULL;
/* FILE* variants take a filename argument; the others act on inode */
1168 if (cmd == IOC_MDC_GETFILEINFO ||
1169 cmd == IOC_MDC_GETFILESTRIPE) {
1170 filename = getname((const char *)arg);
1171 if (IS_ERR(filename))
1172 RETURN(PTR_ERR(filename));
1174 rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1175 &lmmsize, &request);
1177 rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
1181 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
1183 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1184 /* swabbed by mdc_getattr_name */
1185 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
/* GETINFO-style callers still want the stat part without an EA */
1191 if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1192 cmd == LL_IOC_MDC_GETINFO))
1193 GOTO(skip_lmm, rc = 0);
1198 if (cmd == IOC_MDC_GETFILESTRIPE ||
1199 cmd == LL_IOC_LOV_GETSTRIPE) {
1200 lump = (struct lov_user_md *)arg;
1202 struct lov_user_mds_data *lmdp;
1203 lmdp = (struct lov_user_mds_data *)arg;
1204 lump = &lmdp->lmd_lmm;
1206 rc = copy_to_user(lump, lmm, lmmsize);
1208 GOTO(out_lmm, rc = -EFAULT);
/* build a struct stat from the MDS reply body for GETINFO callers */
1210 if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1211 struct lov_user_mds_data *lmdp;
1214 st.st_dev = inode->i_sb->s_dev;
1215 st.st_mode = body->mode;
1216 st.st_nlink = body->nlink;
1217 st.st_uid = body->uid;
1218 st.st_gid = body->gid;
1219 st.st_rdev = body->rdev;
1220 st.st_size = body->size;
1221 st.st_blksize = CFS_PAGE_SIZE;
1222 st.st_blocks = body->blocks;
1223 st.st_atime = body->atime;
1224 st.st_mtime = body->mtime;
1225 st.st_ctime = body->ctime;
1226 st.st_ino = body->ino;
1228 lmdp = (struct lov_user_mds_data *)arg;
1229 rc = copy_to_user(&lmdp->lmd_st, &st, sizeof(st));
1231 GOTO(out_lmm, rc = -EFAULT);
/* JOIN-format EAs were separately allocated and must be freed here */
1236 if (lmm && lmm->lmm_magic == LOV_MAGIC_JOIN)
1237 OBD_FREE(lmm, lmmsize);
1239 ptlrpc_req_finished(request);
1244 case IOC_LOV_GETINFO: {
1245 struct lov_user_mds_data *lumd;
1246 struct lov_stripe_md *lsm;
1247 struct lov_user_md *lum;
1248 struct lov_mds_md *lmm;
1253 lumd = (struct lov_user_mds_data *)arg;
1254 lum = &lumd->lmd_lmm;
1256 rc = ll_get_max_mdsize(sbi, &lmmsize);
1260 OBD_ALLOC(lmm, lmmsize);
1261 rc = copy_from_user(lmm, lum, lmmsize);
1263 GOTO(free_lmm, rc = -EFAULT);
1265 switch (lmm->lmm_magic) {
1266 case LOV_USER_MAGIC_V1:
/* already host-endian on little-endian machines */
1267 if (LOV_USER_MAGIC == cpu_to_le32(LOV_USER_MAGIC))
1269 /* swab objects first so that stripes num will be sane */
1270 lustre_swab_lov_user_md_objects(
1271 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1272 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1273 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1275 case LOV_USER_MAGIC_V3:
1276 if (LOV_USER_MAGIC == cpu_to_le32(LOV_USER_MAGIC))
1278 /* swab objects first so that stripes num will be sane */
1279 lustre_swab_lov_user_md_objects(
1280 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1281 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1282 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1285 GOTO(free_lmm, rc = -EINVAL);
/* unpack the EA into an in-memory stripe descriptor */
1288 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1290 GOTO(free_lmm, rc = -ENOMEM);
1292 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1296 /* Perform glimpse_size operation. */
1297 memset(&st, 0, sizeof(st));
1299 rc = ll_glimpse_ioctl(sbi, lsm, &st);
1303 rc = copy_to_user(&lumd->lmd_st, &st, sizeof(st));
1305 GOTO(free_lsm, rc = -EFAULT);
1309 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1311 OBD_FREE(lmm, lmmsize);
1314 case OBD_IOC_LLOG_CATINFO: {
1315 struct ptlrpc_request *req = NULL;
1318 char *bufs[3] = { NULL }, *str;
1319 int lens[3] = { sizeof(struct ptlrpc_body) };
1320 int size[2] = { sizeof(struct ptlrpc_body) };
1322 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1327 if (!data->ioc_inlbuf1) {
1328 obd_ioctl_freedata(buf, len);
/* request buffers: keyword (inlbuf1) plus optional client name */
1332 lens[REQ_REC_OFF] = data->ioc_inllen1;
1333 bufs[REQ_REC_OFF] = data->ioc_inlbuf1;
1334 if (data->ioc_inllen2) {
1335 lens[REQ_REC_OFF + 1] = data->ioc_inllen2;
1336 bufs[REQ_REC_OFF + 1] = data->ioc_inlbuf2;
1338 lens[REQ_REC_OFF + 1] = 0;
1339 bufs[REQ_REC_OFF + 1] = NULL;
1342 req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import,
1343 LUSTRE_LOG_VERSION, LLOG_CATINFO, 3, lens,
1346 GOTO(out_catinfo, rc = -ENOMEM);
1348 size[REPLY_REC_OFF] = data->ioc_plen1;
1349 ptlrpc_req_set_repsize(req, 2, size);
1351 rc = ptlrpc_queue_wait(req);
1352 str = lustre_msg_string(req->rq_repmsg, REPLY_REC_OFF,
1355 rc = copy_to_user(data->ioc_pbuf1, str,data->ioc_plen1);
1356 ptlrpc_req_finished(req);
1358 obd_ioctl_freedata(buf, len);
1361 case OBD_IOC_QUOTACHECK: {
1362 struct obd_quotactl *oqctl;
1365 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1368 OBD_ALLOC_PTR(oqctl);
1371 oqctl->qc_type = arg;
/* run the check on the MDC first, then the OSC */
1372 rc = obd_quotacheck(sbi->ll_mdc_exp, oqctl);
1374 CDEBUG(D_INFO, "mdc_quotacheck failed: rc %d\n", rc);
1378 rc = obd_quotacheck(sbi->ll_osc_exp, oqctl);
1380 CDEBUG(D_INFO, "osc_quotacheck failed: rc %d\n", rc);
1382 OBD_FREE_PTR(oqctl);
1385 case OBD_IOC_POLL_QUOTACHECK: {
1386 struct if_quotacheck *check;
1389 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1392 OBD_ALLOC_PTR(check);
1396 rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)check,
1399 CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1400 if (copy_to_user((void *)arg, check, sizeof(*check)))
1405 rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)check,
1408 CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1409 if (copy_to_user((void *)arg, check, sizeof(*check)))
1414 OBD_FREE_PTR(check);
1417 case OBD_IOC_QUOTACTL: {
1418 struct if_quotactl *qctl;
1419 struct obd_quotactl *oqctl;
1421 int cmd, type, id, rc = 0;
1423 OBD_ALLOC_PTR(qctl);
1427 OBD_ALLOC_PTR(oqctl);
1432 if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
1433 GOTO(out_quotactl, rc = -EFAULT);
1436 type = qctl->qc_type;
/* permission checks depend on the sub-command */
1439 case LUSTRE_Q_INVALIDATE:
1440 case LUSTRE_Q_FINVALIDATE:
1445 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1446 GOTO(out_quotactl, rc = -EPERM);
/* users may query only their own usage unless privileged */
1449 if (((type == USRQUOTA && current->euid != id) ||
1450 (type == GRPQUOTA && !in_egroup_p(id))) &&
1451 !cfs_capable(CFS_CAP_SYS_ADMIN))
1452 GOTO(out_quotactl, rc = -EPERM);
1454 /* XXX: dqb_valid is borrowed as a flag to mark that
1455 * only mds quota is wanted */
1456 if (qctl->qc_dqblk.dqb_valid) {
1457 qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
1458 u.cli.cl_target_uuid;
1459 qctl->qc_dqblk.dqb_valid = 0;
1466 CERROR("unsupported quotactl op: %#x\n", cmd);
1467 GOTO(out_quotactl, -ENOTTY);
1470 QCTL_COPY(oqctl, qctl);
/* a target UUID directs the request at one specific OBD */
1472 if (qctl->obd_uuid.uuid[0]) {
1473 struct obd_device *obd;
1474 struct obd_uuid *uuid = &qctl->obd_uuid;
1476 obd = class_find_client_notype(uuid,
1477 &sbi->ll_osc_exp->exp_obd->obd_uuid);
1479 GOTO(out_quotactl, rc = -ENOENT);
1481 if (cmd == Q_GETINFO)
1482 oqctl->qc_cmd = Q_GETOINFO;
1483 else if (cmd == Q_GETQUOTA)
1484 oqctl->qc_cmd = Q_GETOQUOTA;
1486 GOTO(out_quotactl, rc = -EINVAL);
1488 if (sbi->ll_mdc_exp->exp_obd == obd) {
1489 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
/* otherwise search the LOV targets for the matching OST export */
1492 struct obd_export *exp;
1493 struct lov_obd *lov = &sbi->ll_osc_exp->
1496 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1497 if (!lov->lov_tgts[i] ||
1498 !lov->lov_tgts[i]->ltd_active)
1500 exp = lov->lov_tgts[i]->ltd_exp;
1501 if (exp->exp_obd == obd) {
1502 rc = obd_quotactl(exp, oqctl);
1508 oqctl->qc_cmd = cmd;
1509 QCTL_COPY(qctl, oqctl);
1511 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1514 GOTO(out_quotactl, rc);
1517 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
/* a half-enabled quota is worse than none: roll back on failure */
1518 if (rc && rc != -EBUSY && cmd == Q_QUOTAON) {
1519 oqctl->qc_cmd = Q_QUOTAOFF;
1520 obd_quotactl(sbi->ll_mdc_exp, oqctl);
1523 QCTL_COPY(qctl, oqctl);
1525 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1529 OBD_FREE_PTR(oqctl);
1532 case OBD_IOC_GETNAME_OLD:
1533 case OBD_IOC_GETNAME: {
1534 struct obd_device *obd = class_exp2obd(sbi->ll_osc_exp);
1537 if (copy_to_user((void *)arg, obd->obd_name,
1538 strlen(obd->obd_name) + 1))
/* default: forward unrecognized commands to the OSC */
1543 RETURN(obd_iocontrol(cmd, sbi->ll_osc_exp,0,NULL,(void *)arg));
/* File operations for directory inodes: open/release shared with regular
 * files, readdir dispatched by protocol, ioctl handled above.
 * NOTE(review): the closing brace of this initializer is not visible in
 * this sampled listing. */
1547 struct file_operations ll_dir_operations = {
1548 .open = ll_file_open,
1549 .release = ll_file_release,
1550 .read = generic_read_dir,
1551 .readdir = ll_readdir,
1552 .ioctl = ll_dir_ioctl