1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Directory code for lustre client.
42 #include <linux/pagemap.h>
44 #include <linux/version.h>
45 #include <linux/smp_lock.h>
46 #include <asm/uaccess.h>
47 #include <linux/buffer_head.h> // for wait_on_buffer
49 #define DEBUG_SUBSYSTEM S_LLITE
51 #include <obd_support.h>
52 #include <obd_class.h>
53 #include <lustre_lib.h>
54 #include <lustre/lustre_idl.h>
55 #include <lustre_lite.h>
56 #include <lustre_dlm.h>
57 #include "llite_internal.h"
59 #ifndef HAVE_PAGE_CHECKED
60 #ifdef HAVE_PG_FS_MISC
61 #define PageChecked(page) test_bit(PG_fs_misc, &(page)->flags)
62 #define SetPageChecked(page) set_bit(PG_fs_misc, &(page)->flags)
64 #error PageChecked or PageFsMisc not defined in kernel
68 /* returns the page unlocked, but with a reference */
/* NOTE(review): this listing elides interior lines (braces, some local
 * declarations and the return path); comments describe only what is
 * visible here. */
/* Fill one directory page by fetching it synchronously from the MDS
 * using the 1.8 wire protocol (ll_fid + mds_body). */
69 static int ll_dir_readpage(struct file *file, struct page *page)
71 struct inode *inode = page->mapping->host;
72 struct ll_fid mdc_fid;
74 struct ptlrpc_request *request;
75 struct mds_body *body;
/* Byte offset of this page inside the directory stream. */
79 offset = (__u64)page->index << CFS_PAGE_SHIFT;
80 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n",
81 inode->i_ino, inode->i_generation, inode, offset);
/* Build the 1.8-style fid from ino/generation, then ask the MDC layer
 * to read this page over the wire. */
83 ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
85 rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid,
86 offset, page, &request);
88 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
90 LASSERT(body != NULL); /* checked by mdc_readpage() */
91 /* swabbed by mdc_readpage() */
92 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
/* The server's size is authoritative: refresh i_size under the Lustre
 * size lock when it differs. */
94 if (body->size != i_size_read(inode)) {
95 ll_inode_size_lock(inode, 0);
96 i_size_write(inode, body->size);
97 ll_inode_size_unlock(inode, 0);
100 SetPageUptodate(page);
102 ptlrpc_req_finished(request);
/* Address-space operations for directory inodes (1.8 path): only
 * ->readpage is provided; directory pages are read synchronously. */
109 struct address_space_operations ll_dir_aops = {
110 .readpage = ll_dir_readpage,
/* Mask that rounds a directory offset down to the start of its
 * filesystem block (assumes s_blocksize is a power of two). */
113 static inline unsigned ll_dir_page_mask(struct inode *inode)
115 return ~(inode->i_sb->s_blocksize - 1);
119 * Check consistency of a single entry.
/* Validates one on-disk directory entry: minimum length, 4-byte
 * alignment, rec_len large enough for the name, and that the record
 * does not cross a filesystem-block boundary.  On failure, msg names
 * the violated invariant and a CERROR is emitted below.
 * NOTE(review): interior lines (msg declaration, return) are elided
 * from this listing. */
121 static int ll_dir_check_entry(struct inode *dir, struct ll_dir_entry *ent,
122 unsigned offset, unsigned rec_len, pgoff_t index)
127 * Consider adding more checks.
130 if (unlikely(rec_len < ll_dir_rec_len(1)))
131 msg = "entry is too short";
132 else if (unlikely(rec_len & 3))
133 msg = "wrong alignment";
134 else if (unlikely(rec_len < ll_dir_rec_len(ent->lde_name_len)))
135 msg = "rec_len doesn't match name_len";
/* XOR of first/last byte offsets detects a block-boundary crossing. */
136 else if (unlikely(((offset + rec_len - 1) ^ offset) &
137 ll_dir_page_mask(dir)))
138 msg = "directory entry across blocks";
141 CERROR("%s: bad entry in directory %lu/%u: %s - "
142 "offset=%lu+%u, inode=%lu, rec_len=%d,"
143 " name_len=%d\n", ll_i2mdcexp(dir)->exp_obd->obd_name,
144 dir->i_ino, dir->i_generation, msg,
145 index << CFS_PAGE_SHIFT,
146 offset, (unsigned long)le32_to_cpu(ent->lde_inode),
147 rec_len, ent->lde_name_len);
/* Validate every entry on a freshly-read directory page and mark the
 * page Checked.  The last (partial) page is padded with dummy
 * forwarding entries so iteration code can treat all pages as full.
 * NOTE(review): interior lines (limit/off/err/reclen declarations, the
 * loop header start, error handling) are elided from this listing. */
151 static void ll_dir_check_page(struct inode *dir, struct page *page)
154 unsigned size = dir->i_sb->s_blocksize;
155 char *addr = page_address(page);
160 struct ll_dir_entry *ent;
/* Is this the last page of the directory (the one containing EOF)? */
163 if ((i_size_read(dir) >> CFS_PAGE_SHIFT) == (__u64)page->index) {
167 limit = i_size_read(dir) & ~CFS_PAGE_MASK;
/* Directory size must be a multiple of the block size. */
168 if (limit & (size - 1)) {
169 CERROR("%s: dir %lu/%u size %llu doesn't match %u\n",
170 ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
171 dir->i_generation, i_size_read(dir), size);
175 * Place dummy forwarding entries to streamline
/* Pad the tail of the page with empty entries spanning one block
 * each (name_len == 0 marks them dead). */
178 for (off = limit; off < CFS_PAGE_SIZE; off += size) {
179 ent = ll_entry_at(addr, off);
180 ent->lde_rec_len = cpu_to_le16(size);
181 ent->lde_name_len = 0;
186 limit = CFS_PAGE_SIZE;
/* Walk entries up to the last position a minimal entry could fit. */
189 !err && off <= limit - ll_dir_rec_len(1); off += reclen) {
190 ent = ll_entry_at(addr, off);
191 reclen = le16_to_cpu(ent->lde_rec_len);
192 err = ll_dir_check_entry(dir, ent, off, reclen, page->index);
/* If the walk stopped short of the limit, the final entry spans
 * the page boundary: report it. */
195 if (!err && off != limit) {
196 ent = ll_entry_at(addr, off);
197 CERROR("%s: entry in directory %lu/%u spans the page boundary "
198 "offset="LPU64"+%u, inode=%lu\n",
199 ll_i2mdcexp(dir)->exp_obd->obd_name,
200 dir->i_ino, dir->i_generation,
201 (__u64)page->index << CFS_PAGE_SHIFT,
202 off, (unsigned long)le32_to_cpu(ent->lde_inode),
207 SetPageChecked(page);
/* Return directory page n under DLM protection (1.8 path).
 * First try to match an existing CR UPDATE inodebits lock on the
 * directory; if none, enqueue one via an IT_READDIR intent.  Then read
 * the page through the page cache and validate it.
 * NOTE(review): braces, error labels (out_unlock, fail paths) and the
 * RETURN are elided from this listing. */
210 struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
212 struct ldlm_res_id res_id;
213 struct lustre_handle lockh;
214 struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
215 struct address_space *mapping = dir->i_mapping;
217 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
/* Try to reuse a compatible granted lock before enqueueing. */
220 fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
221 rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
222 &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
224 struct lookup_intent it = { .it_op = IT_READDIR };
225 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
226 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
227 struct ptlrpc_request *request;
228 struct mdc_op_data data = { { 0 } };
230 ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL);
232 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
233 &data, &lockh, NULL, 0, 0);
/* The intent carries the enqueue reply; release it either way. */
235 request = (struct ptlrpc_request *)it.d.lustre.it_data;
237 ptlrpc_req_finished(request);
239 CERROR("lock enqueue: rc: %d\n", rc);
243 ldlm_lock_dump_handle(D_OTHER, &lockh);
/* Read (or find cached) page n via the aops ->readpage filler. */
245 page = read_cache_page(mapping, n,
246 (filler_t*)mapping->a_ops->readpage, NULL);
248 GOTO(out_unlock, page);
252 if (!PageUptodate(page))
254 if (!PageChecked(page))
255 ll_dir_check_page(dir, page);
260 ldlm_lock_decref(&lockh, LCK_CR);
265 page = ERR_PTR(-EIO);
/* Re-align a directory offset (possibly left mid-entry by lseek) to an
 * entry boundary: restart from the beginning of the containing chunk
 * and walk forward entry by entry until reaching/passing the target.
 * Returns the offset of the aligned entry within the page. */
269 static inline unsigned ll_dir_validate_entry(char *base, unsigned offset,
272 struct ll_dir_entry *de = ll_entry_at(base, offset);
273 struct ll_dir_entry *p = ll_entry_at(base, offset & mask);
274 while (p < de && p->lde_rec_len > 0)
275 p = ll_dir_next_entry(p);
276 return (char *)p - base;
280 * File type constants. The same as in ext2 for compatibility.
/* Map on-disk LL_DIR_FT_* file-type codes to the DT_* values expected
 * by filldir().  Indexed with (lde_file_type & (LL_DIR_FT_MAX - 1)). */
295 static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = {
296 [LL_DIR_FT_UNKNOWN] = DT_UNKNOWN,
297 [LL_DIR_FT_REG_FILE] = DT_REG,
298 [LL_DIR_FT_DIR] = DT_DIR,
299 [LL_DIR_FT_CHRDEV] = DT_CHR,
300 [LL_DIR_FT_BLKDEV] = DT_BLK,
301 [LL_DIR_FT_FIFO] = DT_FIFO,
302 [LL_DIR_FT_SOCK] = DT_SOCK,
303 [LL_DIR_FT_SYMLINK] = DT_LNK,
307 * Process one page. Returns:
309 * -ve: filldir commands readdir to stop.
310 * +ve: number of entries submitted to filldir.
311 * 0: no live entries on this page.
/* Walk the live entries of one directory page starting at *offset and
 * feed them to filldir.  *offset is updated to the current entry so a
 * stopped iteration can resume at the same place.
 * NOTE(review): the loop's end/nr declarations, the counter increment
 * and the return are elided from this listing. */
314 static int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
315 filldir_t filldir, void *cookie)
317 struct ll_dir_entry *de;
321 de = ll_entry_at(addr, *offset);
/* Last position in the page a minimal entry could start at. */
322 end = addr + CFS_PAGE_SIZE - ll_dir_rec_len(1);
323 for (nr = 0 ;(char*)de <= end; de = ll_dir_next_entry(de)) {
/* lde_inode == 0 marks a dead/padding entry; skip those. */
324 if (de->lde_inode != 0) {
326 *offset = (char *)de - addr;
327 if (filldir(cookie, de->lde_name, de->lde_name_len,
328 base | *offset, le32_to_cpu(de->lde_inode),
329 ll_dir_filetype_table[de->lde_file_type &
330 (LL_DIR_FT_MAX - 1)]))
/* readdir() implementation for the 1.8 (offset-based) protocol:
 * f_pos encodes page index (upper bits) | in-page offset (lower bits).
 * NOTE(review): several interior lines (declarations of page/kaddr/rc,
 * loop header pieces, RETURN paths) are elided from this listing. */
337 static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir)
339 struct inode *inode = filp->f_dentry->d_inode;
340 loff_t pos = filp->f_pos;
341 unsigned offset = pos & ~CFS_PAGE_MASK;
342 pgoff_t idx = pos >> CFS_PAGE_SHIFT;
343 pgoff_t npages = dir_pages(inode);
344 unsigned chunk_mask = ll_dir_page_mask(inode);
/* lseek() since our last pass invalidates the saved in-page offset. */
345 int need_revalidate = (filp->f_version != inode->i_version);
347 int done; /* when this becomes negative --- stop iterating */
351 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %llu/%llu\n",
352 inode->i_ino, inode->i_generation, inode,
353 pos, i_size_read(inode));
356 * Checking ->i_size without the lock. Should be harmless, as server
/* Already at/past EOF: nothing to emit. */
359 if (pos > i_size_read(inode) - ll_dir_rec_len(1))
362 for (done = 0; idx < npages; idx++, offset = 0) {
364 * We can assume that all blocks on this page are filled with
365 * entries, because ll_dir_check_page() placed special dummy
372 CDEBUG(D_EXT2,"read %lu of dir %lu/%u page %lu/%lu "
374 CFS_PAGE_SIZE, inode->i_ino, inode->i_generation,
375 idx, npages, i_size_read(inode));
376 page = ll_get_dir_page(inode, idx);
378 /* size might have been updated by mdc_readpage */
379 npages = dir_pages(inode);
383 CERROR("error reading dir %lu/%u page %lu: rc %d\n",
384 inode->i_ino, inode->i_generation, idx, rc);
388 kaddr = page_address(page);
389 if (need_revalidate) {
391 * File offset was changed by lseek() and possibly
392 * points in the middle of an entry. Re-scan from the
393 * beginning of the chunk.
395 offset = ll_dir_validate_entry(kaddr, offset,
399 done = ll_readdir_page(kaddr, idx << CFS_PAGE_SHIFT,
400 &offset, filldir, dirent);
404 * Some entries were sent to the user space, return
410 * filldir is satisfied.
/* Persist resume position and revalidation cookie. */
415 filp->f_pos = (idx << CFS_PAGE_SHIFT) | offset;
416 filp->f_version = inode->i_version;
417 touch_atime(filp->f_vfsmnt, filp->f_dentry);
423 * Chain of hash overflow pages.
/* Placeholder type: the overflow-chain mechanism is not implemented
 * yet, so init/fini are (visibly) empty hooks kept for future use. */
425 struct ll_dir_chain {
426 /* XXX something. Later */
429 static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
433 static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
/* Map a directory hash to a page-cache index by complementing it
 * (~0 - value).  The mapping is its own inverse; see the comment in
 * ll_dir_page_locate() for why the complement is used. */
437 static inline __u32 hash_x_index(__u32 value)
439 return ((__u32)~0) - value;
443 * Layout of readdir pages, as transmitted on wire.
/* NOTE(review): the struct lu_dirent / struct lu_dirpage definitions
 * below are heavily elided in this listing (struct keywords, several
 * fields and the enum body are not visible); the surviving fragments
 * and their original doc comments are kept verbatim. */
446 /** valid if LUDA_FID is set. */
447 struct lu_fid lde_fid;
448 /** a unique entry identifier: a hash or an offset. */
450 /** total record length, including all attributes. */
454 /** optional variable size attributes following this entry.
455 * taken from enum lu_dirent_attrs.
458 /** name is followed by the attributes indicated in ->ldp_attrs, in
459 * their natural order. After the last attribute, padding bytes are
460 * added to make ->lde_reclen a multiple of 8.
466 __u64 ldp_hash_start;
471 struct lu_dirent ldp_entries[0];
475 * Definitions of optional directory entry attributes formats.
477 * Individual attributes do not have their length encoded in a generic way. It
478 * is assumed that consumer of an attribute knows its format. This means that
479 * it is impossible to skip over an unknown attribute, except by skipping over all
480 * remaining attributes (by using ->lde_reclen), which is not too
481 * constraining, because new server versions will append new attributes at
482 * the end of an entry.
486 * Fid directory attribute: a fid of an object referenced by the entry. This
487 * will be almost always requested by the client and supplied by the server.
489 * Aligned to 8 bytes.
491 /* To have compatibility with 1.8, lets have fid in lu_dirent struct. */
496 * Aligned to 2 bytes.
502 enum lu_dirpage_flags {
/* Compute the on-wire record size for a lu_dirent with the given name
 * length and attribute mask.  With LUDA_TYPE, the name is padded to
 * luda_type alignment before the type attribute is appended.  The
 * final size is always rounded up to a multiple of 8. */
506 static inline int lu_dirent_calc_size(int namelen, __u16 attr)
510 if (attr & LUDA_TYPE) {
511 const unsigned align = sizeof(struct luda_type) - 1;
512 size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
513 size += sizeof(struct luda_type);
515 size = sizeof(struct lu_dirent) + namelen;
517 return (size + 7) & ~7;
521 * return IF_* type for given lu_dirent entry.
522 * IF_* flag shld be converted to particular OS file type in
523 * platform llite module.
/* Extract the DT_* file type from the optional LUDA_TYPE attribute
 * that follows the (alignment-padded) name in the entry; the mirrored
 * padding math matches lu_dirent_calc_size().
 * NOTE(review): the type variable declaration/default and the return
 * are elided from this listing. */
525 __u16 ll_dirent_type_get(struct lu_dirent *ent)
528 struct luda_type *lt;
531 if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
532 const unsigned align = sizeof(struct luda_type) - 1;
534 len = le16_to_cpu(ent->lde_namelen);
535 len = (len + align) & ~align;
536 lt = (void *) ent->lde_name + len;
537 type = CFS_IFTODT(le16_to_cpu(lt->lt_type));
/* First entry of a dirpage, or (per the elided branch) presumably NULL
 * when the page is flagged LDF_EMPTY — TODO confirm against the full
 * source; the early-return body is not visible in this listing. */
542 static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
544 if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY)
547 return dp->ldp_entries;
/* Advance to the following entry using the on-wire record length; a
 * reclen of 0 terminates iteration (the else branch and return are
 * elided from this listing). */
550 static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
552 struct lu_dirent *next;
554 if (le16_to_cpu(ent->lde_reclen) != 0)
555 next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
/* Size of one entry: use the stored reclen when present, otherwise
 * (last entry, reclen == 0) recompute it from namelen + attrs. */
562 static inline int lu_dirent_size(struct lu_dirent *ent)
564 if (le16_to_cpu(ent->lde_reclen) == 0) {
565 return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
566 le32_to_cpu(ent->lde_attrs));
568 return le16_to_cpu(ent->lde_reclen);
/* Sentinel hash marking end-of-directory in the 2.0 protocol. */
571 #define DIR_END_OFF 0xfffffffffffffffeULL
/* Kernel-compat shims: older kernels protect the page-cache radix tree
 * with an rwlock (tree_lock), newer ones with a spinlock. */
573 #ifdef HAVE_RW_TREE_LOCK
574 #define TREE_READ_LOCK_IRQ(mapping) read_lock_irq(&(mapping)->tree_lock)
575 #define TREE_READ_UNLOCK_IRQ(mapping) read_unlock_irq(&(mapping)->tree_lock)
577 #define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock)
578 #define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock)
581 /* returns the page unlocked, but with a reference */
/* 2.0-protocol directory readpage: the page index encodes a hash (via
 * hash_x_index) rather than a byte offset, and the reply carries an
 * mdt_body.  NOTE(review): braces, some declarations (hash, fid, rc)
 * and the return path are elided from this listing. */
582 static int ll_dir_readpage_20(struct file *file, struct page *page)
584 struct inode *inode = page->mapping->host;
585 struct ptlrpc_request *request;
586 struct mdt_body *body;
/* Recover the directory hash from the complemented page index. */
592 hash = hash_x_index(page->index);
593 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
594 inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
596 ll_inode2fid(&fid, inode);
597 rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid,
598 hash, page, &request);
600 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
602 /* Checked by mdc_readpage() */
603 LASSERT(body != NULL);
/* Unlike the 1.8 path, only trust the size when the server says it is
 * valid (OBD_MD_FLSIZE). */
605 if (body->valid & OBD_MD_FLSIZE) {
606 ll_inode_size_lock(inode, 0);
607 i_size_write(inode, body->size);
608 ll_inode_size_unlock(inode, 0);
610 SetPageUptodate(page);
612 ptlrpc_req_finished(request);
/* 2.0-path page validation stub: format checking is deferred (XXX
 * below); currently it only marks the page as checked. */
620 static void ll_check_page(struct inode *dir, struct page *page)
622 /* XXX: check page format later */
623 SetPageChecked(page);
628 * Find, kmap and return page that contains given hash.
/* Locate a cached directory page whose [ldp_hash_start, ldp_hash_end]
 * range covers the given hash, returning its bounds via *start/*end.
 * A stale or hash-collision page is truncated/released instead.
 * NOTE(review): some declarations (page, found), kmap/kunmap calls and
 * the final return are elided from this listing. */
630 static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
631 __u64 *start, __u64 *end)
633 struct address_space *mapping = dir->i_mapping;
635 * Complement of hash is used as an index so that
636 * radix_tree_gang_lookup() can be used to find a page with starting
637 * hash _smaller_ than one we are looking for.
639 unsigned long offset = hash_x_index(hash);
/* Lock the radix tree only for the lookup + refcount bump. */
644 TREE_READ_LOCK_IRQ(mapping);
645 found = radix_tree_gang_lookup(&mapping->page_tree,
646 (void **)&page, offset, 1);
648 struct lu_dirpage *dp;
650 page_cache_get(page);
651 TREE_READ_UNLOCK_IRQ(mapping);
653 * In contrast to find_lock_page() we are sure that directory
654 * page cannot be truncated (while DLM lock is held) and,
655 * hence, can avoid restart.
657 * In fact, page cannot be locked here at all, because
658 * ll_dir_readpage() does synchronous io.
661 if (PageUptodate(page)) {
663 *start = le64_to_cpu(dp->ldp_hash_start);
664 *end = le64_to_cpu(dp->ldp_hash_end);
665 LASSERT(*start <= hash);
/* Hash falls outside this page (or on a colliding end that is not
 * a whole-page collision): drop the stale page from cache. */
666 if (hash > *end || (*end != *start && hash == *end)) {
669 ll_truncate_complete_page(page);
671 page_cache_release(page);
/* Page not up to date: treat as I/O failure. */
675 page_cache_release(page);
676 page = ERR_PTR(-EIO);
680 TREE_READ_UNLOCK_IRQ(mapping);
/* 2.0-protocol counterpart of ll_get_dir_page(): return the directory
 * page covering the given hash under a DLM inodebits lock.  First look
 * for a cached page (ll_dir_page_locate); on a miss, read it through
 * the page cache with ll_dir_readpage_20 as filler.  `exact` demands a
 * page starting exactly at `hash` (seekdir semantics).
 * NOTE(review): braces, several declarations (page, rc, start, end,
 * mode), goto labels (hash_collision, out_unlock, fail) and returns
 * are elided from this listing. */
686 static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact,
687 struct ll_dir_chain *chain)
689 struct ldlm_res_id res_id;
690 struct lustre_handle lockh;
691 struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
692 struct address_space *mapping = dir->i_mapping;
693 struct lu_dirpage *dp;
695 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
/* Reuse a granted lock when possible; otherwise enqueue via intent. */
702 fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
704 rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
705 &res_id, LDLM_IBITS, &policy, mode, &lockh);
707 struct lookup_intent it = { .it_op = IT_READDIR };
708 struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode,
709 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
710 struct ptlrpc_request *request;
711 struct mdc_op_data op_data = { { 0 } };
713 ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL);
715 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
716 &op_data, &lockh, NULL, 0, 0);
718 request = (struct ptlrpc_request *)it.d.lustre.it_data;
720 ptlrpc_req_finished(request);
722 CERROR("lock enqueue: rc: %d\n", rc);
726 ldlm_lock_dump_handle(D_OTHER, &lockh);
728 page = ll_dir_page_locate(dir, hash, &start, &end);
730 GOTO(out_unlock, page);
734 * XXX nikita: not entirely correct handling of a corner case:
735 * suppose hash chain of entries with hash value HASH crosses
736 * border between pages P0 and P1. First both P0 and P1 are
737 * cached, seekdir() is called for some entry from the P0 part
738 * of the chain. Later P0 goes out of cache. telldir(HASH)
739 * happens and finds P1, as it starts with matching hash
740 * value. Remaining entries from P0 part of the chain are
741 * skipped. (Is that really a bug?)
743 * Possible solutions: 0. don't cache P1 is such case, handle
744 * it as an "overflow" page. 1. invalidate all pages at
745 * once. 2. use HASH|1 as an index for P1.
747 if (exact && hash != start) {
749 * readdir asked for a page starting _exactly_ from
750 * given hash, but cache contains stale page, with
751 * entries with smaller hash values. Stale page should
752 * be invalidated, and new one fetched.
754 CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n",
755 page, (unsigned long)hash, (unsigned long)start);
757 ll_truncate_complete_page(page);
759 page_cache_release(page);
761 GOTO(hash_collision, page);
/* Cache miss: fetch the page via the 2.0 filler. */
765 page = read_cache_page(mapping, hash_x_index(hash),
766 (filler_t*)ll_dir_readpage_20, NULL);
768 GOTO(out_unlock, page);
772 if (!PageUptodate(page))
774 if (!PageChecked(page))
775 ll_check_page(dir, page);
779 dp = page_address(page);
781 start = le64_to_cpu(dp->ldp_hash_start);
782 end = le64_to_cpu(dp->ldp_hash_end);
/* start == end means every entry on this page shares one hash. */
784 LASSERT(start == hash);
785 CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
787 * Fetch whole overflow chain...
794 ldlm_lock_decref(&lockh, mode);
799 page = ERR_PTR(-EIO);
/* readdir() implementation for the 2.0 (hash-based) protocol: f_pos is
 * a directory hash; pages are fetched by hash and walked entry by
 * entry until filldir stops us or DIR_END_OFF is reached.
 * NOTE(review): many interior lines (declarations of page/rc/done/
 * name/namelen/fid/ino/type/next, page release calls, hash-skip logic,
 * 32-bit f_pos handling, RETURN) are elided from this listing. */
803 static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir)
805 struct inode *inode = filp->f_dentry->d_inode;
806 __u64 pos = filp->f_pos;
807 struct ll_sb_info *sbi = ll_i2sbi(inode);
809 struct ll_dir_chain chain;
812 int shift,need_32bit;
/* 32-bit userspace needs 32-bit-safe inode numbers/offsets. */
816 need_32bit = ll_need_32bit_api(sbi);
818 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu 32bit_api %d\n",
819 inode->i_ino, inode->i_generation, inode,
820 (unsigned long)pos, i_size_read(inode), need_32bit);
822 if (pos == DIR_END_OFF)
831 ll_dir_chain_init(&chain);
833 page = ll_get_dir_page_20(inode, pos, 0, &chain);
835 while (rc == 0 && !done) {
836 struct lu_dirpage *dp;
837 struct lu_dirent *ent;
841 * If page is empty (end of directoryis reached),
844 __u64 hash = DIR_END_OFF;
847 dp = page_address(page);
/* Walk every entry on the page until filldir says stop. */
848 for (ent = lu_dirent_start(dp); ent != NULL && !done;
849 ent = lu_dirent_next(ent)) {
855 hash = le64_to_cpu(ent->lde_hash);
856 namelen = le16_to_cpu(ent->lde_namelen);
860 * Skip until we find target hash
872 name = ent->lde_name;
873 fid_le_to_cpu(&fid, &fid);
875 ino = ll_fid_build_ino32((struct ll_fid *)&fid);
877 ino = ll_fid_build_ino((struct ll_fid *)&fid);
879 type = ll_dirent_type_get(ent);
880 done = filldir(cookie, name, namelen,
881 (loff_t)hash, ino, type);
/* Hash of the next page to fetch (or DIR_END_OFF). */
883 next = le64_to_cpu(dp->ldp_hash_end);
887 if (pos == DIR_END_OFF)
889 * End of directory reached.
892 else if (1 /* chain is exhausted*/)
894 * Normal case: continue to the next
897 page = ll_get_dir_page_20(inode, pos, 1,
901 * go into overflow page.
909 CERROR("error reading dir "DFID" at %lu: rc %d\n",
910 PFID(ll_inode_lu_fid(inode)),
911 (unsigned long)pos, rc);
915 filp->f_pos = (loff_t)(__s32)pos;
916 filp->f_version = inode->i_version;
917 touch_atime(filp->f_vfsmnt, filp->f_dentry);
919 ll_dir_chain_fini(&chain);
/* VFS ->readdir entry point: dispatch to the 2.0 (FID/hash-based) or
 * 1.8 (offset-based) implementation depending on whether the MDC
 * connection negotiated OBD_CONNECT_FID. */
924 static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
926 struct inode *inode = filp->f_dentry->d_inode;
927 struct ll_sb_info *sbi = ll_i2sbi(inode);
929 if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) {
930 return ll_readdir_20(filp, cookie, filldir);
932 return ll_readdir_18(filp, cookie, filldir);
/* Copy the quota-control fields of one qctl structure to another,
 * field by field, via the Q_COPY helper. */
936 #define QCTL_COPY(out, in) \
938 Q_COPY(out, in, qc_cmd); \
939 Q_COPY(out, in, qc_type); \
940 Q_COPY(out, in, qc_id); \
941 Q_COPY(out, in, qc_stat); \
942 Q_COPY(out, in, qc_dqinfo); \
943 Q_COPY(out, in, qc_dqblk); \
/* Send a "key=value" configuration string to the MGS via the MGC
 * export.  NOTE(review): allocation/free of msp and the return are
 * elided from this listing.  Also note: strncpy() does not guarantee
 * NUL-termination if strlen(string) >= MGS_PARAM_MAXLEN — callers
 * presumably pass bounded strings; verify at the call sites. */
946 static int ll_send_mgc_param(struct obd_export *mgc, char *string)
948 struct mgs_send_param *msp;
955 strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
956 rc = obd_set_info_async(mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
957 sizeof(struct mgs_send_param), msp, NULL);
959 CERROR("Failed to set parameter: %d\n", rc);
/* Return a freshly allocated (MGS_PARAM_MAXLEN) copy of the filesystem
 * name, derived from the mount profile with a trailing "-client"
 * suffix stripped.  Caller owns the buffer (freed with OBD_FREE in
 * ll_dir_setstripe).  NOTE(review): the OBD_ALLOC failure check, the
 * len adjustment for the suffix, and the return are elided from this
 * listing. */
965 static char *ll_get_fsname(struct inode *inode)
967 struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
971 OBD_ALLOC(fsname, MGS_PARAM_MAXLEN);
972 len = strlen(lsi->lsi_lmd->lmd_profile);
973 ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
974 if (ptr && (strcmp(ptr, "-client") == 0))
976 strncpy(fsname, lsi->lsi_lmd->lmd_profile, len);
/* Set (or clear, if lump is NULL) the LOV striping EA on a directory
 * via mdc_setattr; when set_default is true on a connected MGS, also
 * push the filesystem-wide default stripesize/stripecount/stripeoffset
 * parameters to the MGS.
 * NOTE(review): braces, error-checking branches after allocations, the
 * end/out labels and RETURN are elided from this listing. */
982 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
985 struct ll_sb_info *sbi = ll_i2sbi(inode);
986 struct mdc_op_data data = { { 0 } };
987 struct ptlrpc_request *req = NULL;
988 struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
989 struct obd_device *mgc = lsi->lsi_mgc;
990 char *fsname = NULL, *param = NULL;
991 struct iattr attr = { 0 };
992 int lum_size = 0, rc = 0;
995 if (lump->lmm_magic == LOV_USER_MAGIC_V3)
996 lum_size = sizeof(struct lov_user_md_v3);
998 lum_size = sizeof(struct lov_user_md_v1);
1000 * This is coming from userspace, so should be in
1001 * local endian. But the MDS would like it in little
1002 * endian, so we swab it before we send it.
/* A magic already in little-endian means host order != LE: swab. */
1004 if ((lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) &&
1005 (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))) {
1006 rc = lustre_swab_lov_user_md(lump);
1010 } else { /* NULL value means remove LOV EA */
1011 lum_size = sizeof(struct lov_user_md_v1);
1014 ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0, NULL);
1016 /* swabbing is done in lov_setstripe() on server side */
1017 rc = mdc_setattr(sbi->ll_mdc_exp, &data,
1018 &attr, lump, lum_size, NULL, 0, &req);
1020 ptlrpc_req_finished(req);
/* EPERM/EACCES are expected permission denials; don't spam logs. */
1021 if (rc != -EPERM && rc != -EACCES)
1022 CERROR("mdc_setattr fails: rc = %d\n", rc);
1025 ptlrpc_req_finished(req);
1027 /* In the following we use the fact that LOV_USER_MAGIC_V1 and
1028 LOV_USER_MAGIC_V3 have the same initial fields so we do not
1029 need the make the distiction between the 2 versions */
1030 if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
1031 OBD_ALLOC(param, MGS_PARAM_MAXLEN);
1033 /* Get fsname and assume devname to be -MDT0000. */
1034 fsname = ll_get_fsname(inode);
1035 /* Set root stripesize */
1036 sprintf(param, "%s-MDT0000.lov.stripesize=%u", fsname,
1037 lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
1038 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1042 /* Set root stripecount */
1043 sprintf(param, "%s-MDT0000.lov.stripecount=%u", fsname,
1044 lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
1045 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1049 /* Set root stripeoffset */
1050 sprintf(param, "%s-MDT0000.lov.stripeoffset=%u", fsname,
1051 lump ? le16_to_cpu(lump->lmm_stripe_offset) :
1052 (typeof(lump->lmm_stripe_offset))(-1));
1053 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
/* Cleanup path: free fsname/param buffers allocated above. */
1058 OBD_FREE(fsname, MGS_PARAM_MAXLEN);
1060 OBD_FREE(param, MGS_PARAM_MAXLEN);
/* Fetch a directory's LOV striping EA from the MDS via mdc_getattr.
 * On success, *lmmp points into the reply buffer (caller keeps the
 * request pinned via *request until done) and *lmm_size is its length.
 * Returns -ENODATA when the directory has no striping EA.
 * NOTE(review): braces, some declarations (fid, lmmsize, rc), the out
 * label and RETURN are elided from this listing. */
1065 int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
1066 int *lmm_size, struct ptlrpc_request **request)
1068 struct ll_sb_info *sbi = ll_i2sbi(inode);
1070 struct mds_body *body;
1071 struct lov_mds_md *lmm = NULL;
1072 struct ptlrpc_request *req = NULL;
1075 ll_inode2fid(&fid, inode);
/* Size the reply buffer for the largest possible EA. */
1077 rc = ll_get_max_mdsize(sbi, &lmmsize);
1081 rc = mdc_getattr(sbi->ll_mdc_exp, &fid,
1082 OBD_MD_FLEASIZE|OBD_MD_FLDIREA,
1085 CDEBUG(D_INFO, "mdc_getattr failed on inode "
1086 "%lu/%u: rc %d\n", inode->i_ino,
1087 inode->i_generation, rc);
1090 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1092 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1093 /* swabbed by mdc_getattr_name */
1094 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1096 lmmsize = body->eadatasize;
/* No EA bits set or zero-length EA: nothing to return. */
1098 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1100 GOTO(out, rc = -ENODATA);
1103 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1104 LASSERT(lmm != NULL);
1105 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1108 * This is coming from the MDS, so is probably in
1109 * little endian. We convert it to host endian before
1110 * passing it to userspace.
1112 /* We don't swab objects for directories */
/* Only swab on big-endian hosts (LOV_MAGIC != its LE encoding). */
1113 if (((le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) ||
1114 (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)) &&
1115 (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))) {
1116 rc = lustre_swab_lov_user_md((struct lov_user_md*)lmm);
1123 *lmm_size = lmmsize;
1128 static int ll_dir_ioctl(struct inode *inode, struct file *file,
1129 unsigned int cmd, unsigned long arg)
1131 struct ll_sb_info *sbi = ll_i2sbi(inode);
1132 struct obd_ioctl_data *data;
1135 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
1136 inode->i_ino, inode->i_generation, inode, cmd);
1138 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1139 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1142 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1144 case FSFILT_IOC_GETFLAGS:
1145 case FSFILT_IOC_SETFLAGS:
1146 RETURN(ll_iocontrol(inode, file, cmd, arg));
1147 case FSFILT_IOC_GETVERSION_OLD:
1148 case FSFILT_IOC_GETVERSION:
1149 RETURN(put_user(inode->i_generation, (int *)arg));
1150 /* We need to special case any other ioctls we want to handle,
1151 * to send them to the MDS/OST as appropriate and to properly
1152 * network encode the arg field.
1153 case EXT3_IOC_SETVERSION_OLD:
1154 case EXT3_IOC_SETVERSION:
1156 case IOC_MDC_LOOKUP: {
1157 struct ptlrpc_request *request = NULL;
1161 int namelen, rc, len = 0;
1163 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1168 filename = data->ioc_inlbuf1;
1169 namelen = data->ioc_inllen1;
1172 CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1173 GOTO(out, rc = -EINVAL);
1176 ll_inode2fid(&fid, inode);
1177 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename, namelen,
1178 OBD_MD_FLID, 0, &request);
1180 CDEBUG(D_INFO, "mdc_getattr_name: %d\n", rc);
1184 ptlrpc_req_finished(request);
1188 obd_ioctl_freedata(buf, len);
1191 case LL_IOC_LOV_SETSTRIPE: {
1192 struct lov_user_md_v3 lumv3;
1193 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1194 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1195 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1198 int set_default = 0;
1200 LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1201 LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1202 sizeof(lumv3p->lmm_objects[0]));
1204 /* first try with v1 which is smaller than v3 */
1205 if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
1208 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1209 if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
1213 if (inode->i_sb->s_root == file->f_dentry)
1216 /* in v1 and v3 cases lumv1 points to data */
1217 rc = ll_dir_setstripe(inode, lumv1, set_default);
1221 case LL_IOC_OBD_STATFS:
1222 RETURN(ll_obd_statfs(inode, (void *)arg));
1223 case LL_IOC_LOV_GETSTRIPE:
1224 case LL_IOC_MDC_GETINFO:
1225 case IOC_MDC_GETFILEINFO:
1226 case IOC_MDC_GETFILESTRIPE: {
1227 struct ptlrpc_request *request = NULL;
1228 struct mds_body *body;
1229 struct lov_user_md *lump;
1230 struct lov_mds_md *lmm = NULL;
1231 char *filename = NULL;
1234 if (cmd == IOC_MDC_GETFILEINFO ||
1235 cmd == IOC_MDC_GETFILESTRIPE) {
1236 filename = getname((const char *)arg);
1237 if (IS_ERR(filename))
1238 RETURN(PTR_ERR(filename));
1240 rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1241 &lmmsize, &request);
1243 rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
1247 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
1249 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1250 /* swabbed by mdc_getattr_name */
1251 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
1257 if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1258 cmd == LL_IOC_MDC_GETINFO))
1259 GOTO(skip_lmm, rc = 0);
1264 if (cmd == IOC_MDC_GETFILESTRIPE ||
1265 cmd == LL_IOC_LOV_GETSTRIPE) {
1266 lump = (struct lov_user_md *)arg;
1268 struct lov_user_mds_data *lmdp;
1269 lmdp = (struct lov_user_mds_data *)arg;
1270 lump = &lmdp->lmd_lmm;
1272 if (copy_to_user(lump, lmm, lmmsize) != 0) {
1273 if (copy_to_user(lump, lmm, sizeof(*lump)) != 0)
1274 GOTO(out_lmm, rc = -EFAULT);
1278 if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1279 struct lov_user_mds_data *lmdp;
1282 st.st_dev = inode->i_sb->s_dev;
1283 st.st_mode = body->mode;
1284 st.st_nlink = body->nlink;
1285 st.st_uid = body->uid;
1286 st.st_gid = body->gid;
1287 st.st_rdev = body->rdev;
1288 st.st_size = body->size;
1289 st.st_blksize = CFS_PAGE_SIZE;
1290 st.st_blocks = body->blocks;
1291 st.st_atime = body->atime;
1292 st.st_mtime = body->mtime;
1293 st.st_ctime = body->ctime;
1294 st.st_ino = body->ino;
1296 lmdp = (struct lov_user_mds_data *)arg;
1297 if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st)))
1298 GOTO(out_lmm, rc = -EFAULT);
1303 if (lmm && lmm->lmm_magic == LOV_MAGIC_JOIN)
1304 OBD_FREE(lmm, lmmsize);
1306 ptlrpc_req_finished(request);
1311 case IOC_LOV_GETINFO: {
1312 struct lov_user_mds_data *lumd;
1313 struct lov_stripe_md *lsm;
1314 struct lov_user_md *lum;
1315 struct lov_mds_md *lmm;
1320 lumd = (struct lov_user_mds_data *)arg;
1321 lum = &lumd->lmd_lmm;
1323 rc = ll_get_max_mdsize(sbi, &lmmsize);
1327 OBD_ALLOC(lmm, lmmsize);
1328 if (copy_from_user(lmm, lum, lmmsize))
1329 GOTO(free_lmm, rc = -EFAULT);
1331 if (LOV_USER_MAGIC != cpu_to_le32(LOV_USER_MAGIC)) {
1332 rc = lustre_swab_lov_user_md(
1333 (struct lov_user_md_v1 *)lmm);
1336 rc = lustre_swab_lov_user_md_objects(
1337 (struct lov_user_md*)lmm);
1342 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1344 GOTO(free_lmm, rc = -ENOMEM);
1346 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1350 /* Perform glimpse_size operation. */
1351 memset(&st, 0, sizeof(st));
1353 rc = ll_glimpse_ioctl(sbi, lsm, &st);
1357 if (copy_to_user(&lumd->lmd_st, &st, sizeof(st)))
1358 GOTO(free_lsm, rc = -EFAULT);
1362 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1364 OBD_FREE(lmm, lmmsize);
1367 case OBD_IOC_LLOG_CATINFO: {
1368 struct ptlrpc_request *req = NULL;
1371 char *bufs[3] = { NULL }, *str;
1372 int lens[3] = { sizeof(struct ptlrpc_body) };
1373 int size[2] = { sizeof(struct ptlrpc_body) };
1375 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1380 if (!data->ioc_inlbuf1) {
1381 obd_ioctl_freedata(buf, len);
1385 lens[REQ_REC_OFF] = data->ioc_inllen1;
1386 bufs[REQ_REC_OFF] = data->ioc_inlbuf1;
1387 if (data->ioc_inllen2) {
1388 lens[REQ_REC_OFF + 1] = data->ioc_inllen2;
1389 bufs[REQ_REC_OFF + 1] = data->ioc_inlbuf2;
1391 lens[REQ_REC_OFF + 1] = 0;
1392 bufs[REQ_REC_OFF + 1] = NULL;
1395 req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import,
1396 LUSTRE_LOG_VERSION, LLOG_CATINFO, 3, lens,
1399 GOTO(out_catinfo, rc = -ENOMEM);
1401 size[REPLY_REC_OFF] = data->ioc_plen1;
1402 ptlrpc_req_set_repsize(req, 2, size);
1404 rc = ptlrpc_queue_wait(req);
1405 str = lustre_msg_string(req->rq_repmsg, REPLY_REC_OFF,
1408 if (copy_to_user(data->ioc_pbuf1, str,data->ioc_plen1))
1410 ptlrpc_req_finished(req);
1412 obd_ioctl_freedata(buf, len);
1415 case OBD_IOC_QUOTACHECK: {
1416 struct obd_quotactl *oqctl;
1419 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1422 OBD_ALLOC_PTR(oqctl);
1425 oqctl->qc_type = arg;
1426 rc = obd_quotacheck(sbi->ll_mdc_exp, oqctl);
1428 CDEBUG(D_INFO, "mdc_quotacheck failed: rc %d\n", rc);
1432 rc = obd_quotacheck(sbi->ll_osc_exp, oqctl);
1434 CDEBUG(D_INFO, "osc_quotacheck failed: rc %d\n", rc);
1436 OBD_FREE_PTR(oqctl);
1439 case OBD_IOC_POLL_QUOTACHECK: {
1440 struct if_quotacheck *check;
1443 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1446 OBD_ALLOC_PTR(check);
1450 rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)check,
1453 CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1454 if (copy_to_user((void *)arg, check, sizeof(*check)))
1455 CDEBUG(D_QUOTA, "copy_to_user failed\n");
1459 rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)check,
1462 CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1463 if (copy_to_user((void *)arg, check, sizeof(*check)))
1464 CDEBUG(D_QUOTA, "copy_to_user failed\n");
1468 OBD_FREE_PTR(check);
1471 case OBD_IOC_QUOTACTL: {
1472 struct if_quotactl *qctl;
1473 struct obd_quotactl *oqctl;
1475 int cmd, type, id, rc = 0;
1477 OBD_ALLOC_PTR(qctl);
1481 OBD_ALLOC_PTR(oqctl);
1486 if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
1487 GOTO(out_quotactl, rc = -EFAULT);
1490 type = qctl->qc_type;
1493 case LUSTRE_Q_INVALIDATE:
1494 case LUSTRE_Q_FINVALIDATE:
1499 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1500 GOTO(out_quotactl, rc = -EPERM);
1503 if (((type == USRQUOTA && cfs_curproc_euid() != id) ||
1504 (type == GRPQUOTA && !in_egroup_p(id))) &&
1505 !cfs_capable(CFS_CAP_SYS_ADMIN))
1506 GOTO(out_quotactl, rc = -EPERM);
1508 /* XXX: dqb_valid is borrowed as a flag to mark that
1509 * only mds quota is wanted */
1510 if (qctl->qc_dqblk.dqb_valid) {
1511 qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
1512 u.cli.cl_target_uuid;
1513 qctl->qc_dqblk.dqb_valid = 0;
1520 CERROR("unsupported quotactl op: %#x\n", cmd);
1521 GOTO(out_quotactl, -ENOTTY);
1524 QCTL_COPY(oqctl, qctl);
1526 if (qctl->obd_uuid.uuid[0]) {
1527 struct obd_device *obd;
1528 struct obd_uuid *uuid = &qctl->obd_uuid;
1530 obd = class_find_client_notype(uuid,
1531 &sbi->ll_osc_exp->exp_obd->obd_uuid);
1533 GOTO(out_quotactl, rc = -ENOENT);
1535 if (cmd == Q_GETINFO)
1536 oqctl->qc_cmd = Q_GETOINFO;
1537 else if (cmd == Q_GETQUOTA)
1538 oqctl->qc_cmd = Q_GETOQUOTA;
1540 GOTO(out_quotactl, rc = -EINVAL);
1542 if (sbi->ll_mdc_exp->exp_obd == obd) {
1543 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
1546 struct obd_export *exp;
1547 struct lov_obd *lov = &sbi->ll_osc_exp->
1550 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1551 if (!lov->lov_tgts[i] ||
1552 !lov->lov_tgts[i]->ltd_active)
1554 exp = lov->lov_tgts[i]->ltd_exp;
1555 if (exp->exp_obd == obd) {
1556 rc = obd_quotactl(exp, oqctl);
1562 oqctl->qc_cmd = cmd;
1563 QCTL_COPY(qctl, oqctl);
1565 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1568 GOTO(out_quotactl, rc);
1571 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
1572 if (rc && rc != -EBUSY && cmd == Q_QUOTAON) {
1573 oqctl->qc_cmd = Q_QUOTAOFF;
1574 obd_quotactl(sbi->ll_mdc_exp, oqctl);
1577 QCTL_COPY(qctl, oqctl);
1579 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1583 OBD_FREE_PTR(oqctl);
1586 case OBD_IOC_GETNAME_OLD:
1587 case OBD_IOC_GETNAME: {
1588 struct obd_device *obd = class_exp2obd(sbi->ll_osc_exp);
1591 if (copy_to_user((void *)arg, obd->obd_name,
1592 strlen(obd->obd_name) + 1))
1596 case LL_IOC_PATH2FID: {
1597 if (copy_to_user((void *)arg, ll_inode_lu_fid(inode),
1598 sizeof(struct lu_fid)))
1603 case LL_IOC_GET_CONNECT_FLAGS: {
1604 if (copy_to_user((void *)arg,
1605 &sbi->ll_mdc_exp->exp_connect_flags,
1611 RETURN(obd_iocontrol(cmd, sbi->ll_osc_exp,0,NULL,(void *)arg));
1615 struct file_operations ll_dir_operations = {
1616 .open = ll_file_open,
1617 .release = ll_file_release,
1618 .read = generic_read_dir,
1619 .readdir = ll_readdir,
1620 .ioctl = ll_dir_ioctl