1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Directory code for lustre client.
42 #include <linux/pagemap.h>
44 #include <linux/version.h>
45 #include <linux/smp_lock.h>
46 #include <asm/uaccess.h>
47 #include <linux/buffer_head.h> // for wait_on_buffer
49 #define DEBUG_SUBSYSTEM S_LLITE
51 #include <obd_support.h>
52 #include <obd_class.h>
53 #include <lustre_lib.h>
54 #include <lustre/lustre_idl.h>
55 #include <lustre_lite.h>
56 #include <lustre_dlm.h>
57 #include "llite_internal.h"
/* Kernel compatibility: older kernels lack PageChecked/SetPageChecked; when
 * the PG_fs_misc page flag exists, emulate them with it.
 * NOTE(review): excerpt omits the intervening #else/#endif lines. */
59 #ifndef HAVE_PAGE_CHECKED
60 #ifdef HAVE_PG_FS_MISC
61 #define PageChecked(page) test_bit(PG_fs_misc, &(page)->flags)
62 #define SetPageChecked(page) set_bit(PG_fs_misc, &(page)->flags)
64 #error PageChecked or PageFsMisc not defined in kernel
68 /* returns the page unlocked, but with a reference */
/* 1.8-protocol readpage: fetch one directory page from the MDS via
 * mdc_readpage(), sync the cached inode size with the reply body, and mark
 * the page uptodate.
 * NOTE(review): excerpt is missing lines (declarations of rc/offset, braces,
 * error handling, page unlock) — confirm against the full file. */
69 static int ll_dir_readpage(struct file *file, struct page *page)
71 struct inode *inode = page->mapping->host;
72 struct ll_fid mdc_fid;
74 struct ptlrpc_request *request;
75 struct mds_body *body;
/* Byte offset of this page within the directory. */
79 offset = (__u64)page->index << CFS_PAGE_SHIFT;
80 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n",
81 inode->i_ino, inode->i_generation, inode, offset);
/* Build the MDS fid for this directory inode. */
83 ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR);
85 rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid,
86 offset, page, &request);
88 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
90 LASSERT(body != NULL); /* checked by mdc_readpage() */
91 /* swabbed by mdc_readpage() */
92 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
/* Keep the client's directory size in sync with the server's reply. */
94 if (body->size != i_size_read(inode)) {
95 ll_inode_size_lock(inode, 0);
96 i_size_write(inode, body->size);
97 ll_inode_size_unlock(inode, 0);
100 SetPageUptodate(page);
102 ptlrpc_req_finished(request);
/* Address-space operations for directory inodes: pages are filled by
 * ll_dir_readpage (1.8 protocol path). */
109 struct address_space_operations ll_dir_aops = {
110 .readpage = ll_dir_readpage,
/* Mask that rounds a directory offset down to the start of its fs block
 * (s_blocksize is assumed to be a power of two). */
113 static inline unsigned ll_dir_page_mask(struct inode *inode)
115 return ~(inode->i_sb->s_blocksize - 1);
119 * Check consistency of a single entry.
/* Validate one on-disk directory entry at @offset in page @index:
 * - rec_len at least the minimal record size,
 * - rec_len 4-byte aligned,
 * - rec_len large enough to hold the entry's name,
 * - entry does not cross a block boundary.
 * A failing check sets msg and is reported with CERROR below.
 * NOTE(review): excerpt omits the msg declaration and the return path. */
121 static int ll_dir_check_entry(struct inode *dir, struct ll_dir_entry *ent,
122 unsigned offset, unsigned rec_len, pgoff_t index)
127 * Consider adding more checks.
130 if (unlikely(rec_len < ll_dir_rec_len(1)))
131 msg = "entry is too short";
132 else if (unlikely(rec_len & 3))
133 msg = "wrong alignment";
134 else if (unlikely(rec_len < ll_dir_rec_len(ent->lde_name_len)))
135 msg = "rec_len doesn't match name_len";
136 else if (unlikely(((offset + rec_len - 1) ^ offset) &
137 ll_dir_page_mask(dir)))
138 msg = "directory entry across blocks";
141 CERROR("%s: bad entry in directory %lu/%u: %s - "
142 "offset=%lu+%u, inode=%lu, rec_len=%d,"
143 " name_len=%d\n", ll_i2mdcexp(dir)->exp_obd->obd_name,
144 dir->i_ino, dir->i_generation, msg,
145 index << CFS_PAGE_SHIFT,
146 offset, (unsigned long)le32_to_cpu(ent->lde_inode),
147 rec_len, ent->lde_name_len);
/* Validate a whole directory page: check that the directory size is
 * block-aligned on the last page, pad the tail with dummy forwarding
 * entries, walk every entry through ll_dir_check_entry(), and finally mark
 * the page checked so this runs only once per page.
 * NOTE(review): excerpt omits declarations (limit, off, reclen, err) and
 * several braces — confirm against the full file. */
151 static void ll_dir_check_page(struct inode *dir, struct page *page)
154 unsigned size = dir->i_sb->s_blocksize;
155 char *addr = page_address(page);
160 struct ll_dir_entry *ent;
/* Last (possibly partial) page of the directory? */
163 if ((i_size_read(dir) >> CFS_PAGE_SHIFT) == (__u64)page->index) {
167 limit = i_size_read(dir) & ~CFS_PAGE_MASK;
168 if (limit & (size - 1)) {
169 CERROR("%s: dir %lu/%u size %llu doesn't match %u\n",
170 ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
171 dir->i_generation, i_size_read(dir), size);
175 * Place dummy forwarding entries to streamline
/* Fill the unused tail of the page with empty records of one block each. */
178 for (off = limit; off < CFS_PAGE_SIZE; off += size) {
179 ent = ll_entry_at(addr, off);
180 ent->lde_rec_len = cpu_to_le16(size);
181 ent->lde_name_len = 0;
186 limit = CFS_PAGE_SIZE;
/* Walk every record on the page, stopping at the first bad entry. */
189 !err && off <= limit - ll_dir_rec_len(1); off += reclen) {
190 ent = ll_entry_at(addr, off);
191 reclen = le16_to_cpu(ent->lde_rec_len);
192 err = ll_dir_check_entry(dir, ent, off, reclen, page->index);
/* A clean walk must land exactly on the limit; otherwise the last
 * record spans the page boundary. */
195 if (!err && off != limit) {
196 ent = ll_entry_at(addr, off);
197 CERROR("%s: entry in directory %lu/%u spans the page boundary "
198 "offset="LPU64"+%u, inode=%lu\n",
199 ll_i2mdcexp(dir)->exp_obd->obd_name,
200 dir->i_ino, dir->i_generation,
201 (__u64)page->index << CFS_PAGE_SHIFT,
202 off, (unsigned long)le32_to_cpu(ent->lde_inode));
207 SetPageChecked(page);
/* 1.8 path: return directory page @n of @dir with a reference held.
 * Takes (or matches an already-granted) CR inodebits UPDATE DLM lock on the
 * directory so the cached page stays valid, reads the page through the page
 * cache, and validates it with ll_dir_check_page() on first use.
 * NOTE(review): excerpt omits declarations (rc, page), several braces and
 * labels (out_unlock, fail) — confirm against the full file. */
210 struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
212 struct ldlm_res_id res_id;
213 struct lustre_handle lockh;
214 struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
215 struct address_space *mapping = dir->i_mapping;
217 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
220 fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
/* Try to reuse an already-granted CR lock before enqueueing a new one. */
221 rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
222 &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
224 struct lookup_intent it = { .it_op = IT_READDIR };
225 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
226 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
227 struct ptlrpc_request *request;
228 struct mdc_op_data data = { { 0 } };
230 ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL);
232 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
233 &data, &lockh, NULL, 0, 0);
235 request = (struct ptlrpc_request *)it.d.lustre.it_data;
237 ptlrpc_req_finished(request);
239 CERROR("lock enqueue: rc: %d\n", rc);
243 ldlm_lock_dump_handle(D_OTHER, &lockh);
/* Read (or find cached) page n; a_ops->readpage is ll_dir_readpage. */
245 page = read_cache_page(mapping, n,
246 (filler_t*)mapping->a_ops->readpage, NULL);
248 GOTO(out_unlock, page);
252 if (!PageUptodate(page))
/* Validate the page contents once; later lookups skip the check. */
254 if (!PageChecked(page))
255 ll_dir_check_page(dir, page);
260 ldlm_lock_decref(&lockh, LCK_CR);
265 page = ERR_PTR(-EIO);
/* Given an arbitrary byte offset (e.g. after lseek), walk entries from the
 * start of the enclosing chunk (offset & mask) until reaching the record at
 * or after @offset, and return the offset of that valid entry boundary. */
269 static inline unsigned ll_dir_validate_entry(char *base, unsigned offset,
272 struct ll_dir_entry *de = ll_entry_at(base, offset);
273 struct ll_dir_entry *p = ll_entry_at(base, offset & mask);
274 while (p < de && p->lde_rec_len > 0)
275 p = ll_dir_next_entry(p);
276 return (char *)p - base;
280 * File type constants. The same as in ext2 for compatibility.
/* Map on-disk LL_DIR_FT_* file-type codes to the kernel DT_* codes that
 * filldir expects. NOTE(review): closing brace of the table not in excerpt. */
295 static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = {
296 [LL_DIR_FT_UNKNOWN] = DT_UNKNOWN,
297 [LL_DIR_FT_REG_FILE] = DT_REG,
298 [LL_DIR_FT_DIR] = DT_DIR,
299 [LL_DIR_FT_CHRDEV] = DT_CHR,
300 [LL_DIR_FT_BLKDEV] = DT_BLK,
301 [LL_DIR_FT_FIFO] = DT_FIFO,
302 [LL_DIR_FT_SOCK] = DT_SOCK,
303 [LL_DIR_FT_SYMLINK] = DT_LNK,
307 * Process one page. Returns:
309 * -ve: filldir commands readdir to stop.
310 * +ve: number of entries submitted to filldir.
311 * 0: no live entries on this page.
/* Feed live entries (lde_inode != 0) from one directory page, starting at
 * *offset, to filldir. *offset is updated to the current entry so a
 * partially consumed page can be resumed.
 * NOTE(review): excerpt omits declarations (end, nr), braces and the
 * function's return — confirm against the full file. */
314 static int ll_readdir_page(char *addr, __u64 base, unsigned *offset,
315 filldir_t filldir, void *cookie)
317 struct ll_dir_entry *de;
321 de = ll_entry_at(addr, *offset);
/* Last position on the page where a minimal record could still start. */
322 end = addr + CFS_PAGE_SIZE - ll_dir_rec_len(1);
323 for (nr = 0 ;(char*)de <= end; de = ll_dir_next_entry(de)) {
/* Skip deleted/dummy entries, which carry a zero inode number. */
324 if (de->lde_inode != 0) {
326 *offset = (char *)de - addr;
327 if (filldir(cookie, de->lde_name, de->lde_name_len,
328 base | *offset, le32_to_cpu(de->lde_inode),
329 ll_dir_filetype_table[de->lde_file_type &
330 (LL_DIR_FT_MAX - 1)]))
/* 1.8-protocol readdir: iterate directory pages from f_pos to EOF, feeding
 * entries to filldir via ll_readdir_page(). Revalidates the start offset
 * after lseek() (f_version mismatch) and saves the resume position in f_pos.
 * NOTE(review): excerpt omits declarations (page, kaddr, rc), braces, the
 * error path and RETURN — confirm against the full file. */
337 static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir)
339 struct inode *inode = filp->f_dentry->d_inode;
340 loff_t pos = filp->f_pos;
341 unsigned offset = pos & ~CFS_PAGE_MASK;
342 pgoff_t idx = pos >> CFS_PAGE_SHIFT;
343 pgoff_t npages = dir_pages(inode);
344 unsigned chunk_mask = ll_dir_page_mask(inode);
/* lseek() bumps i_version; a mismatch means f_pos may be mid-entry. */
345 int need_revalidate = (filp->f_version != inode->i_version);
347 int done; /* when this becomes negative --- stop iterating */
351 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %llu/%llu\n",
352 inode->i_ino, inode->i_generation, inode,
353 pos, i_size_read(inode));
356 * Checking ->i_size without the lock. Should be harmless, as server
/* Nothing left to read past the last possible entry. */
359 if (pos > i_size_read(inode) - ll_dir_rec_len(1))
362 for (done = 0; idx < npages; idx++, offset = 0) {
364 * We can assume that all blocks on this page are filled with
365 * entries, because ll_dir_check_page() placed special dummy
372 CDEBUG(D_EXT2,"read %lu of dir %lu/%u page %lu/%lu "
374 CFS_PAGE_SIZE, inode->i_ino, inode->i_generation,
375 idx, npages, i_size_read(inode));
376 page = ll_get_dir_page(inode, idx);
378 /* size might have been updated by mdc_readpage */
379 npages = dir_pages(inode);
383 CERROR("error reading dir %lu/%u page %lu: rc %d\n",
384 inode->i_ino, inode->i_generation, idx, rc);
388 kaddr = page_address(page);
389 if (need_revalidate) {
391 * File offset was changed by lseek() and possibly
392 * points in the middle of an entry. Re-scan from the
393 * beginning of the chunk.
395 offset = ll_dir_validate_entry(kaddr, offset,
399 done = ll_readdir_page(kaddr, idx << CFS_PAGE_SHIFT,
400 &offset, filldir, dirent);
404 * Some entries were sent to the user space, return
410 * filldir is satisfied.
/* Save resume position and version stamp for the next call. */
415 filp->f_pos = (idx << CFS_PAGE_SHIFT) | offset;
416 filp->f_version = inode->i_version;
417 touch_atime(filp->f_vfsmnt, filp->f_dentry);
423 * Chain of hash overflow pages.
/* Placeholder type for a chain of hash-overflow pages; currently empty,
 * so init/fini below are no-ops. */
425 struct ll_dir_chain {
426 /* XXX something. Later */
429 static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
433 static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
/* Complement a directory hash for use as a page-cache index, so that
 * radix_tree_gang_lookup() (see ll_dir_page_locate) finds the page whose
 * starting hash is smaller than the one searched for. */
437 static inline __u32 hash_x_index(__u32 value)
439 return ((__u32)~0) - value;
443 * Layout of readdir pages, as transmitted on wire.
/* Wire format of 2.0-protocol readdir pages: a lu_dirpage header (hash
 * range plus flags) followed by packed lu_dirent records.
 * NOTE(review): excerpt omits several field declarations and the closing
 * braces of these declarations — confirm against the full file. */
446 /** valid if LUDA_FID is set. */
447 struct lu_fid lde_fid;
448 /** a unique entry identifier: a hash or an offset. */
450 /** total record length, including all attributes. */
454 /** optional variable size attributes following this entry.
455 * taken from enum lu_dirent_attrs.
458 /** name is followed by the attributes indicated in ->ldp_attrs, in
459 * their natural order. After the last attribute, padding bytes are
460 * added to make ->lde_reclen a multiple of 8.
466 __u64 ldp_hash_start;
471 struct lu_dirent ldp_entries[0];
475 * Definitions of optional directory entry attributes formats.
477 * Individual attributes do not have their length encoded in a generic way. It
478 * is assumed that consumer of an attribute knows its format. This means that
479 * it is impossible to skip over an unknown attribute, except by skipping over all
480 * remaining attributes (by using ->lde_reclen), which is not too
481 * constraining, because new server versions will append new attributes at
482 * the end of an entry.
486 * Fid directory attribute: a fid of an object referenced by the entry. This
487 * will be almost always requested by the client and supplied by the server.
489 * Aligned to 8 bytes.
491 /* To have compatibility with 1.8, lets have fid in lu_dirent struct. */
496 * Aligned to 2 bytes.
502 enum lu_dirpage_flags {
/* Compute the on-wire size of a lu_dirent with a name of @namelen bytes and
 * the attributes in @attr; the final record is padded to a multiple of 8.
 * NOTE(review): excerpt omits the size declaration and else keyword. */
506 static inline int lu_dirent_calc_size(int namelen, __u16 attr)
510 if (attr & LUDA_TYPE) {
511 const unsigned align = sizeof(struct luda_type) - 1;
/* Pad the name so the trailing luda_type attribute is aligned. */
512 size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
513 size += sizeof(struct luda_type);
515 size = sizeof(struct lu_dirent) + namelen;
517 return (size + 7) & ~7;
521 * return IF_* type for given lu_dirent entry.
522 * IF_* flag shld be converted to particular OS file type in
523 * platform llite module.
/* Extract the file type from an entry's LUDA_TYPE attribute, which follows
 * the (alignment-padded) name; converted via CFS_IFTODT.
 * NOTE(review): excerpt omits declarations of type/len and the return. */
525 __u16 ll_dirent_type_get(struct lu_dirent *ent)
528 struct luda_type *lt;
531 if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
532 const unsigned align = sizeof(struct luda_type) - 1;
/* The luda_type attribute sits after the name, aligned to its size. */
534 len = le16_to_cpu(ent->lde_namelen);
535 len = (len + align) & ~align;
536 lt = (void *) ent->lde_name + len;
537 type = CFS_IFTODT(le16_to_cpu(lt->lt_type));
/* First entry of a dirpage, or (in the elided branch) no entry when the
 * page is flagged LDF_EMPTY. */
542 static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
544 if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY)
547 return dp->ldp_entries;
/* Advance to the next entry using lde_reclen; a zero reclen marks the last
 * entry on the page (NOTE(review): the corresponding else/return lines are
 * not in this excerpt). */
550 static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
552 struct lu_dirent *next;
554 if (le16_to_cpu(ent->lde_reclen) != 0)
555 next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
/* Size of an entry: lde_reclen when set; for the last entry on a page
 * (reclen == 0) recompute it from namelen and attrs. */
562 static inline int lu_dirent_size(struct lu_dirent *ent)
564 if (le16_to_cpu(ent->lde_reclen) == 0) {
565 return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
566 le32_to_cpu(ent->lde_attrs));
568 return le16_to_cpu(ent->lde_reclen);
/* Sentinel hash value meaning "end of directory" on the wire. */
571 #define DIR_END_OFF 0xfffffffffffffffeULL
/* Kernel compatibility: mapping->tree_lock is an rwlock on some kernels and
 * a spinlock on others (NOTE(review): the #else/#endif lines are not in this
 * excerpt). */
573 #ifdef HAVE_RW_TREE_LOCK
574 #define TREE_READ_LOCK_IRQ(mapping) read_lock_irq(&(mapping)->tree_lock)
575 #define TREE_READ_UNLOCK_IRQ(mapping) read_unlock_irq(&(mapping)->tree_lock)
577 #define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock)
578 #define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock)
581 /* returns the page unlocked, but with a reference */
/* 2.0-protocol readpage: the page index encodes a complemented hash (see
 * hash_x_index); fetch the page from the MDS by hash, update i_size when
 * the reply carries OBD_MD_FLSIZE, and mark the page uptodate.
 * NOTE(review): excerpt omits declarations (hash, fid, rc), braces and the
 * error path — confirm against the full file. */
582 static int ll_dir_readpage_20(struct file *file, struct page *page)
584 struct inode *inode = page->mapping->host;
585 struct ptlrpc_request *request;
586 struct mdt_body *body;
/* Recover the directory hash this page index stands for. */
592 hash = hash_x_index(page->index);
593 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n",
594 inode->i_ino, inode->i_generation, inode, (unsigned long)hash);
596 ll_inode2fid(&fid, inode);
597 rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid,
598 hash, page, &request);
600 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
602 /* Checked by mdc_readpage() */
603 LASSERT(body != NULL);
/* Only trust the size when the server says it is valid. */
605 if (body->valid & OBD_MD_FLSIZE) {
606 ll_inode_size_lock(inode, 0);
607 i_size_write(inode, body->size);
608 ll_inode_size_unlock(inode, 0);
610 SetPageUptodate(page);
612 ptlrpc_req_finished(request);
/* 2.0-path page validation: currently only marks the page checked; real
 * format verification is a TODO (see XXX below). */
620 static void ll_check_page(struct inode *dir, struct page *page)
622 /* XXX: check page format later */
623 SetPageChecked(page);
628 * Find, kmap and return page that contains given hash.
/* Look up, under the mapping tree lock, the cached dirpage whose hash range
 * may contain @hash; returns the page with a reference, with *start/*end set
 * to its hash range, ERR_PTR(-EIO) for a non-uptodate page, or (in elided
 * code) NULL when no page is cached. Truncates a page whose range no longer
 * covers the hash.
 * NOTE(review): excerpt omits the page/found declarations, braces and the
 * function's return — confirm against the full file. */
630 static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
631 __u64 *start, __u64 *end)
633 struct address_space *mapping = dir->i_mapping;
635 * Complement of hash is used as an index so that
636 * radix_tree_gang_lookup() can be used to find a page with starting
637 * hash _smaller_ than one we are looking for.
639 unsigned long offset = hash_x_index(hash);
644 TREE_READ_LOCK_IRQ(mapping);
645 found = radix_tree_gang_lookup(&mapping->page_tree,
646 (void **)&page, offset, 1);
648 struct lu_dirpage *dp;
/* Pin the page before dropping the tree lock. */
650 page_cache_get(page);
651 TREE_READ_UNLOCK_IRQ(mapping);
653 * In contrast to find_lock_page() we are sure that directory
654 * page cannot be truncated (while DLM lock is held) and,
655 * hence, can avoid restart.
657 * In fact, page cannot be locked here at all, because
658 * ll_dir_readpage() does synchronous io.
661 if (PageUptodate(page)) {
663 *start = le64_to_cpu(dp->ldp_hash_start);
664 *end = le64_to_cpu(dp->ldp_hash_end);
665 LASSERT(*start <= hash);
/* Hash no longer inside this page's range: drop the stale page. */
666 if (hash > *end || (*end != *start && hash == *end)) {
669 ll_truncate_complete_page(page);
671 page_cache_release(page);
675 page_cache_release(page);
676 page = ERR_PTR(-EIO);
680 TREE_READ_UNLOCK_IRQ(mapping);
/* 2.0 path: return the directory page covering @hash, holding a CR ibits
 * UPDATE DLM lock on @dir for the duration. First probes the page cache via
 * ll_dir_page_locate(); on a miss (or after invalidating a stale page when
 * @exact) reads the page by complemented hash through ll_dir_readpage_20().
 * NOTE(review): excerpt omits declarations (page, rc, start, end, mode),
 * many braces and the hash_collision/fail/out_unlock label lines — confirm
 * against the full file. */
686 static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact,
687 struct ll_dir_chain *chain)
689 struct ldlm_res_id res_id;
690 struct lustre_handle lockh;
691 struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
692 struct address_space *mapping = dir->i_mapping;
693 struct lu_dirpage *dp;
695 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
702 fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id);
/* Reuse an already-granted lock when possible, else enqueue a new one. */
704 rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
705 &res_id, LDLM_IBITS, &policy, mode, &lockh);
707 struct lookup_intent it = { .it_op = IT_READDIR };
708 struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode,
709 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir };
710 struct ptlrpc_request *request;
711 struct mdc_op_data op_data = { { 0 } };
713 ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL);
715 rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it,
716 &op_data, &lockh, NULL, 0, 0);
718 request = (struct ptlrpc_request *)it.d.lustre.it_data;
720 ptlrpc_req_finished(request);
722 CERROR("lock enqueue: rc: %d\n", rc);
726 ldlm_lock_dump_handle(D_OTHER, &lockh);
728 page = ll_dir_page_locate(dir, hash, &start, &end);
730 GOTO(out_unlock, page);
734 * XXX nikita: not entirely correct handling of a corner case:
735 * suppose hash chain of entries with hash value HASH crosses
736 * border between pages P0 and P1. First both P0 and P1 are
737 * cached, seekdir() is called for some entry from the P0 part
738 * of the chain. Later P0 goes out of cache. telldir(HASH)
739 * happens and finds P1, as it starts with matching hash
740 * value. Remaining entries from P0 part of the chain are
741 * skipped. (Is that really a bug?)
743 * Possible solutions: 0. don't cache P1 in such case, handle
744 * it as an "overflow" page. 1. invalidate all pages at
745 * once. 2. use HASH|1 as an index for P1.
747 if (exact && hash != start) {
749 * readdir asked for a page starting _exactly_ from
750 * given hash, but cache contains stale page, with
751 * entries with smaller hash values. Stale page should
752 * be invalidated, and new one fetched.
754 CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n",
755 page, (unsigned long)hash, (unsigned long)start);
757 ll_truncate_complete_page(page);
759 page_cache_release(page);
761 GOTO(hash_collision, page);
/* Cache miss: fetch the page keyed by the complemented hash. */
765 page = read_cache_page(mapping, hash_x_index(hash),
766 (filler_t*)ll_dir_readpage_20, NULL);
768 GOTO(out_unlock, page);
772 if (!PageUptodate(page))
774 if (!PageChecked(page))
775 ll_check_page(dir, page);
779 dp = page_address(page);
781 start = le64_to_cpu(dp->ldp_hash_start);
782 end = le64_to_cpu(dp->ldp_hash_end);
784 LASSERT(start == hash);
785 CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
787 * Fetch whole overflow chain...
794 ldlm_lock_decref(&lockh, mode);
799 page = ERR_PTR(-EIO);
/* 2.0-protocol readdir: f_pos is a directory hash. Walk hash-ordered
 * dirpages from pos until filldir stops us or DIR_END_OFF is reached,
 * building inode numbers from entry fids.
 * NOTE(review): excerpt omits declarations (page, rc, done, fid, ino, hash,
 * name, namelen, type, next), braces, the skip logic and error paths —
 * confirm against the full file. */
803 static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir)
805 struct inode *inode = filp->f_dentry->d_inode;
806 struct ll_sb_info *sbi = ll_i2sbi(inode);
807 __u64 pos = filp->f_pos;
809 struct ll_dir_chain chain;
816 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu\n",
817 inode->i_ino, inode->i_generation, inode,
818 (unsigned long)pos, i_size_read(inode));
/* Already at the end-of-directory sentinel: nothing to do. */
820 if (pos == DIR_END_OFF)
829 ll_dir_chain_init(&chain);
831 page = ll_get_dir_page_20(inode, pos, 0, &chain);
833 while (rc == 0 && !done) {
834 struct lu_dirpage *dp;
835 struct lu_dirent *ent;
839 * If page is empty (end of directory is reached),
842 __u64 hash = DIR_END_OFF;
845 dp = page_address(page);
/* Feed every entry on this page to filldir until it says stop. */
846 for (ent = lu_dirent_start(dp); ent != NULL && !done;
847 ent = lu_dirent_next(ent)) {
853 hash = le64_to_cpu(ent->lde_hash);
854 namelen = le16_to_cpu(ent->lde_namelen);
858 * Skip until we find target hash
870 name = ent->lde_name;
871 fid_le_to_cpu(&fid, &fid);
/* Derive the client-visible inode number from the fid. */
872 ino = ll_fid_build_ino(sbi, (struct ll_fid*)&fid);
873 type = ll_dirent_type_get(ent);
874 done = filldir(cookie, name, namelen,
875 (loff_t)hash, ino, type);
877 next = le64_to_cpu(dp->ldp_hash_end);
881 if (pos == DIR_END_OFF)
883 * End of directory reached.
886 else if (1 /* chain is exhausted*/)
888 * Normal case: continue to the next
891 page = ll_get_dir_page_20(inode, pos, 1,
895 * go into overflow page.
903 CERROR("error reading dir "DFID" at %lu: rc %d\n",
904 PFID(ll_inode_lu_fid(inode)),
905 (unsigned long)pos, rc);
/* Save resume hash and version stamp for the next call. */
909 filp->f_pos = (loff_t)(__s32)pos;
910 filp->f_version = inode->i_version;
911 touch_atime(filp->f_vfsmnt, filp->f_dentry);
913 ll_dir_chain_fini(&chain);
/* readdir entry point: dispatch to the 2.0 (fid/hash based) or 1.8
 * (offset based) implementation depending on whether the MDC connection
 * negotiated OBD_CONNECT_FID. */
918 static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
920 struct inode *inode = filp->f_dentry->d_inode;
921 struct ll_sb_info *sbi = ll_i2sbi(inode);
923 if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) {
924 return ll_readdir_20(filp, cookie, filldir);
926 return ll_readdir_18(filp, cookie, filldir);
/* Copy the user-visible quotactl fields between the if_quotactl and
 * obd_quotactl representations, field by field via Q_COPY. */
930 #define QCTL_COPY(out, in) \
932 Q_COPY(out, in, qc_cmd); \
933 Q_COPY(out, in, qc_type); \
934 Q_COPY(out, in, qc_id); \
935 Q_COPY(out, in, qc_stat); \
936 Q_COPY(out, in, qc_dqinfo); \
937 Q_COPY(out, in, qc_dqblk); \
/* Send one "key=value" configuration string to the MGS through the MGC
 * export via obd_set_info_async(KEY_SET_INFO).
 * NOTE(review): excerpt omits the msp allocation, rc declaration and the
 * return — confirm against the full file. */
940 static int ll_send_mgc_param(struct obd_export *mgc, char *string)
942 struct mgs_send_param *msp;
/* NOTE(review): strncpy may leave mgs_param unterminated if string is
 * exactly MGS_PARAM_MAXLEN long — callers appear to build short strings. */
949 strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN);
950 rc = obd_set_info_async(mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
951 sizeof(struct mgs_send_param), msp, NULL);
953 CERROR("Failed to set parameter: %d\n", rc);
/* Return a newly OBD_ALLOC'd buffer (MGS_PARAM_MAXLEN) holding the fsname,
 * i.e. the mount profile with any trailing "-client" stripped. Caller frees.
 * NOTE(review): excerpt omits declarations (fsname, ptr, len), the length
 * adjustment for the "-client" suffix and the return. */
959 static char *ll_get_fsname(struct inode *inode)
961 struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
965 OBD_ALLOC(fsname, MGS_PARAM_MAXLEN);
966 len = strlen(lsi->lsi_lmd->lmd_profile);
/* Strip a trailing "-client" from the profile name, if present. */
967 ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
968 if (ptr && (strcmp(ptr, "-client") == 0))
970 strncpy(fsname, lsi->lsi_lmd->lmd_profile, len);
/* Set the default striping for directory @inode by sending the lov_user_md
 * to the MDS via mdc_setattr; when @set_default is set on the filesystem
 * root, also push stripesize/stripecount/stripeoffset as persistent
 * parameters to the MGS.
 * NOTE(review): excerpt omits rc declaration, several braces, GOTO labels
 * (end, free) and the RETURN — confirm against the full file. */
976 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
979 struct ll_sb_info *sbi = ll_i2sbi(inode);
980 struct mdc_op_data data = { { 0 } };
981 struct ptlrpc_request *req = NULL;
982 struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
983 struct obd_device *mgc = lsi->lsi_mgc;
984 char *fsname = NULL, *param = NULL;
985 int lum_size = sizeof(struct lov_user_md_v1);
987 struct iattr attr = { 0 };
990 if (lump->lmm_magic == LOV_USER_MAGIC_V3)
991 lum_size = sizeof(struct lov_user_md_v3);
993 * This is coming from userspace, so should be in
994 * local endian. But the MDS would like it in little
995 * endian, so we swab it before we send it.
997 if ((lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) &&
998 (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))) {
999 rc = lustre_swab_lov_user_md(lump);
1004 ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0, NULL);
1006 /* swabbing is done in lov_setstripe() on server side */
1007 rc = mdc_setattr(sbi->ll_mdc_exp, &data,
1008 &attr, lump, lum_size, NULL, 0, &req);
1010 ptlrpc_req_finished(req);
/* EPERM/EACCES are expected (permission) failures; don't log those. */
1011 if (rc != -EPERM && rc != -EACCES)
1012 CERROR("mdc_setattr fails: rc = %d\n", rc);
1015 ptlrpc_req_finished(req);
1017 /* In the following we use the fact that LOV_USER_MAGIC_V1 and
1018 LOV_USER_MAGIC_V3 have the same initial fields so we do not
1019 need to make the distinction between the 2 versions */
1020 if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
1021 OBD_ALLOC(param, MGS_PARAM_MAXLEN);
1023 /* Get fsname and assume devname to be -MDT0000. */
1024 fsname = ll_get_fsname(inode);
1025 /* Set root stripesize */
1026 sprintf(param, "%s-MDT0000.lov.stripesize=%u", fsname,
1027 le32_to_cpu(lump->lmm_stripe_size));
1028 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1032 /* Set root stripecount */
1033 sprintf(param, "%s-MDT0000.lov.stripecount=%u", fsname,
1034 le16_to_cpu(lump->lmm_stripe_count));
1035 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1039 /* Set root stripeoffset */
1040 sprintf(param, "%s-MDT0000.lov.stripeoffset=%u", fsname,
1041 le16_to_cpu(lump->lmm_stripe_offset));
1042 rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
1047 OBD_FREE(fsname, MGS_PARAM_MAXLEN);
1049 OBD_FREE(param, MGS_PARAM_MAXLEN);
/* Fetch directory striping EA from the MDS: mdc_getattr with
 * OBD_MD_FLEASIZE|OBD_MD_FLDIREA, validate the reply, convert the LOV EA to
 * host endianness for userspace, and return it (plus size and the request,
 * which the caller must finish) via the out parameters.
 * NOTE(review): excerpt omits declarations (fid, rc, lmmsize), braces and
 * GOTO/out labels — confirm against the full file. */
1054 int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
1055 int *lmm_size, struct ptlrpc_request **request)
1057 struct ll_sb_info *sbi = ll_i2sbi(inode);
1059 struct mds_body *body;
1060 struct lov_mds_md *lmm = NULL;
1061 struct ptlrpc_request *req = NULL;
1064 ll_inode2fid(&fid, inode);
/* Ask for a reply buffer big enough for the largest possible EA. */
1066 rc = ll_get_max_mdsize(sbi, &lmmsize);
1070 rc = mdc_getattr(sbi->ll_mdc_exp, &fid,
1071 OBD_MD_FLEASIZE|OBD_MD_FLDIREA,
1074 CDEBUG(D_INFO, "mdc_getattr failed on inode "
1075 "%lu/%u: rc %d\n", inode->i_ino,
1076 inode->i_generation, rc);
1079 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
1081 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1082 /* swabbed by mdc_getattr_name */
1083 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1085 lmmsize = body->eadatasize;
/* No striping EA present on this directory. */
1087 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1089 GOTO(out, rc = -ENODATA);
1092 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1093 LASSERT(lmm != NULL);
1094 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1097 * This is coming from the MDS, so is probably in
1098 * little endian. We convert it to host endian before
1099 * passing it to userspace.
1101 /* We don't swab objects for directories */
1102 if (((le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V1) ||
1103 (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)) &&
1104 (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))) {
1105 rc = lustre_swab_lov_user_md((struct lov_user_md*)lmm);
1112 *lmm_size = lmmsize;
/* Directory ioctl dispatcher: flags/version passthrough, MDC name lookup,
 * LOV stripe get/set, statfs, llog catinfo, quota check/control, device
 * name and path-to-fid queries; anything unrecognized falls through to
 * obd_iocontrol() on the OSC export.
 * NOTE(review): this excerpt omits many lines (declarations, braces, GOTO
 * labels, RETURNs, several case labels) — each case below is a fragment;
 * confirm details against the full file before relying on them. */
1117 static int ll_dir_ioctl(struct inode *inode, struct file *file,
1118 unsigned int cmd, unsigned long arg)
1120 struct ll_sb_info *sbi = ll_i2sbi(inode);
1121 struct obd_ioctl_data *data;
1124 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
1125 inode->i_ino, inode->i_generation, inode, cmd);
1127 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1128 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1131 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
/* Simple passthrough: ext2-style flags and generation queries. */
1133 case FSFILT_IOC_GETFLAGS:
1134 case FSFILT_IOC_SETFLAGS:
1135 RETURN(ll_iocontrol(inode, file, cmd, arg));
1136 case FSFILT_IOC_GETVERSION_OLD:
1137 case FSFILT_IOC_GETVERSION:
1138 RETURN(put_user(inode->i_generation, (int *)arg));
1139 /* We need to special case any other ioctls we want to handle,
1140 * to send them to the MDS/OST as appropriate and to properly
1141 * network encode the arg field.
1142 case EXT3_IOC_SETVERSION_OLD:
1143 case EXT3_IOC_SETVERSION:
/* Look a name up on the MDS by (fid, filename). */
1145 case IOC_MDC_LOOKUP: {
1146 struct ptlrpc_request *request = NULL;
1150 int namelen, rc, len = 0;
1152 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1157 filename = data->ioc_inlbuf1;
1158 namelen = data->ioc_inllen1;
1161 CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1162 GOTO(out, rc = -EINVAL);
1165 ll_inode2fid(&fid, inode);
1166 rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename, namelen,
1167 OBD_MD_FLID, 0, &request);
1169 CDEBUG(D_INFO, "mdc_getattr_name: %d\n", rc);
1173 ptlrpc_req_finished(request);
1177 obd_ioctl_freedata(buf, len);
/* Set (default) striping on this directory from a user lov_user_md. */
1180 case LL_IOC_LOV_SETSTRIPE: {
1181 struct lov_user_md_v3 lumv3;
1182 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1183 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1184 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1187 int set_default = 0;
1189 LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1190 LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1191 sizeof(lumv3p->lmm_objects[0]));
1193 /* first try with v1 which is smaller than v3 */
1194 rc = copy_from_user(lumv1, lumv1p, sizeof(*lumv1));
/* v3 magic means the user really passed the larger v3 struct. */
1198 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1199 rc = copy_from_user(&lumv3, lumv3p, sizeof(lumv3));
/* Striping set on the fs root becomes the filesystem default. */
1204 if (inode->i_sb->s_root == file->f_dentry)
1207 /* in v1 and v3 cases lumv1 points to data */
1208 rc = ll_dir_setstripe(inode, lumv1, set_default);
1212 case LL_IOC_OBD_STATFS:
1213 RETURN(ll_obd_statfs(inode, (void *)arg));
/* Return striping (and optionally stat info) for this dir or a child. */
1214 case LL_IOC_LOV_GETSTRIPE:
1215 case LL_IOC_MDC_GETINFO:
1216 case IOC_MDC_GETFILEINFO:
1217 case IOC_MDC_GETFILESTRIPE: {
1218 struct ptlrpc_request *request = NULL;
1219 struct mds_body *body;
1220 struct lov_user_md *lump;
1221 struct lov_mds_md *lmm = NULL;
1222 char *filename = NULL;
/* The *FILEINFO/*FILESTRIPE variants name a child by string. */
1225 if (cmd == IOC_MDC_GETFILEINFO ||
1226 cmd == IOC_MDC_GETFILESTRIPE) {
1227 filename = getname((const char *)arg);
1228 if (IS_ERR(filename))
1229 RETURN(PTR_ERR(filename));
1231 rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1232 &lmmsize, &request);
1234 rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
1238 body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF,
1240 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1241 /* swabbed by mdc_getattr_name */
1242 LASSERT(lustre_rep_swabbed(request, REPLY_REC_OFF));
/* -ENODATA (no striping) is fine for the *INFO variants, which
 * still return stat data below. */
1248 if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1249 cmd == LL_IOC_MDC_GETINFO))
1250 GOTO(skip_lmm, rc = 0);
/* Pick where in the user buffer the lov_user_md goes. */
1255 if (cmd == IOC_MDC_GETFILESTRIPE ||
1256 cmd == LL_IOC_LOV_GETSTRIPE) {
1257 lump = (struct lov_user_md *)arg;
1259 struct lov_user_mds_data *lmdp;
1260 lmdp = (struct lov_user_mds_data *)arg;
1261 lump = &lmdp->lmd_lmm;
1263 rc = copy_to_user(lump, lmm, lmmsize);
1265 GOTO(out_lmm, rc = -EFAULT);
/* The *INFO variants also copy a stat-like struct from the body. */
1267 if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1268 struct lov_user_mds_data *lmdp;
1271 st.st_dev = inode->i_sb->s_dev;
1272 st.st_mode = body->mode;
1273 st.st_nlink = body->nlink;
1274 st.st_uid = body->uid;
1275 st.st_gid = body->gid;
1276 st.st_rdev = body->rdev;
1277 st.st_size = body->size;
1278 st.st_blksize = CFS_PAGE_SIZE;
1279 st.st_blocks = body->blocks;
1280 st.st_atime = body->atime;
1281 st.st_mtime = body->mtime;
1282 st.st_ctime = body->ctime;
1283 st.st_ino = body->ino;
1285 lmdp = (struct lov_user_mds_data *)arg;
1286 rc = copy_to_user(&lmdp->lmd_st, &st, sizeof(st));
1288 GOTO(out_lmm, rc = -EFAULT);
/* JOIN-format EAs were allocated separately and must be freed here. */
1293 if (lmm && lmm->lmm_magic == LOV_MAGIC_JOIN)
1294 OBD_FREE(lmm, lmmsize);
1296 ptlrpc_req_finished(request);
/* Glimpse object size for a user-supplied LOV EA. */
1301 case IOC_LOV_GETINFO: {
1302 struct lov_user_mds_data *lumd;
1303 struct lov_stripe_md *lsm;
1304 struct lov_user_md *lum;
1305 struct lov_mds_md *lmm;
1310 lumd = (struct lov_user_mds_data *)arg;
1311 lum = &lumd->lmd_lmm;
1313 rc = ll_get_max_mdsize(sbi, &lmmsize);
1317 OBD_ALLOC(lmm, lmmsize);
1318 rc = copy_from_user(lmm, lum, lmmsize);
1320 GOTO(free_lmm, rc = -EFAULT);
/* Swab only on big-endian hosts (constant is a no-op otherwise). */
1322 if (LOV_USER_MAGIC != cpu_to_le32(LOV_USER_MAGIC)) {
1323 rc = lustre_swab_lov_user_md(
1324 (struct lov_user_md_v1 *)lmm);
1327 rc = lustre_swab_lov_user_md_objects(
1328 (struct lov_user_md*)lmm);
1333 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
1335 GOTO(free_lmm, rc = -ENOMEM);
1337 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
1341 /* Perform glimpse_size operation. */
1342 memset(&st, 0, sizeof(st));
1344 rc = ll_glimpse_ioctl(sbi, lsm, &st);
1348 rc = copy_to_user(&lumd->lmd_st, &st, sizeof(st));
1350 GOTO(free_lsm, rc = -EFAULT);
1354 obd_free_memmd(sbi->ll_osc_exp, &lsm);
1356 OBD_FREE(lmm, lmmsize);
/* Query llog catalog info from the MDS. */
1359 case OBD_IOC_LLOG_CATINFO: {
1360 struct ptlrpc_request *req = NULL;
1363 char *bufs[3] = { NULL }, *str;
1364 int lens[3] = { sizeof(struct ptlrpc_body) };
1365 int size[2] = { sizeof(struct ptlrpc_body) };
1367 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
1372 if (!data->ioc_inlbuf1) {
1373 obd_ioctl_freedata(buf, len);
1377 lens[REQ_REC_OFF] = data->ioc_inllen1;
1378 bufs[REQ_REC_OFF] = data->ioc_inlbuf1;
1379 if (data->ioc_inllen2) {
1380 lens[REQ_REC_OFF + 1] = data->ioc_inllen2;
1381 bufs[REQ_REC_OFF + 1] = data->ioc_inlbuf2;
1383 lens[REQ_REC_OFF + 1] = 0;
1384 bufs[REQ_REC_OFF + 1] = NULL;
1387 req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import,
1388 LUSTRE_LOG_VERSION, LLOG_CATINFO, 3, lens,
1391 GOTO(out_catinfo, rc = -ENOMEM);
1393 size[REPLY_REC_OFF] = data->ioc_plen1;
1394 ptlrpc_req_set_repsize(req, 2, size);
1396 rc = ptlrpc_queue_wait(req);
1397 str = lustre_msg_string(req->rq_repmsg, REPLY_REC_OFF,
1400 rc = copy_to_user(data->ioc_pbuf1, str,data->ioc_plen1);
1401 ptlrpc_req_finished(req);
1403 obd_ioctl_freedata(buf, len);
/* Trigger a quota check on both MDC and OSC (admin only). */
1406 case OBD_IOC_QUOTACHECK: {
1407 struct obd_quotactl *oqctl;
1410 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1413 OBD_ALLOC_PTR(oqctl);
1416 oqctl->qc_type = arg;
1417 rc = obd_quotacheck(sbi->ll_mdc_exp, oqctl);
1419 CDEBUG(D_INFO, "mdc_quotacheck failed: rc %d\n", rc);
1423 rc = obd_quotacheck(sbi->ll_osc_exp, oqctl);
1425 CDEBUG(D_INFO, "osc_quotacheck failed: rc %d\n", rc);
1427 OBD_FREE_PTR(oqctl);
/* Poll quota-check progress on MDC then OSC (admin only). */
1430 case OBD_IOC_POLL_QUOTACHECK: {
1431 struct if_quotacheck *check;
1434 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1437 OBD_ALLOC_PTR(check);
1441 rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)check,
1444 CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1445 if (copy_to_user((void *)arg, check, sizeof(*check)))
1450 rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)check,
1453 CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1454 if (copy_to_user((void *)arg, check, sizeof(*check)))
1459 OBD_FREE_PTR(check);
/* Quota control: permission checks, target selection (MDS vs a specific
 * OST by uuid), then obd_quotactl dispatch. */
1462 case OBD_IOC_QUOTACTL: {
1463 struct if_quotactl *qctl;
1464 struct obd_quotactl *oqctl;
1466 int cmd, type, id, rc = 0;
1468 OBD_ALLOC_PTR(qctl);
1472 OBD_ALLOC_PTR(oqctl);
1477 if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
1478 GOTO(out_quotactl, rc = -EFAULT);
1481 type = qctl->qc_type;
1484 case LUSTRE_Q_INVALIDATE:
1485 case LUSTRE_Q_FINVALIDATE:
1490 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1491 GOTO(out_quotactl, rc = -EPERM);
/* Non-admin users may only query their own user/group quota. */
1494 if (((type == USRQUOTA && cfs_curproc_euid() != id) ||
1495 (type == GRPQUOTA && !in_egroup_p(id))) &&
1496 !cfs_capable(CFS_CAP_SYS_ADMIN))
1497 GOTO(out_quotactl, rc = -EPERM);
1499 /* XXX: dqb_valid is borrowed as a flag to mark that
1500 * only mds quota is wanted */
1501 if (qctl->qc_dqblk.dqb_valid) {
1502 qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
1503 u.cli.cl_target_uuid;
1504 qctl->qc_dqblk.dqb_valid = 0;
1511 CERROR("unsupported quotactl op: %#x\n", cmd);
1512 GOTO(out_quotactl, -ENOTTY);
1515 QCTL_COPY(oqctl, qctl);
/* A uuid selects one specific target (MDT or an OST) to query. */
1517 if (qctl->obd_uuid.uuid[0]) {
1518 struct obd_device *obd;
1519 struct obd_uuid *uuid = &qctl->obd_uuid;
1521 obd = class_find_client_notype(uuid,
1522 &sbi->ll_osc_exp->exp_obd->obd_uuid);
1524 GOTO(out_quotactl, rc = -ENOENT);
1526 if (cmd == Q_GETINFO)
1527 oqctl->qc_cmd = Q_GETOINFO;
1528 else if (cmd == Q_GETQUOTA)
1529 oqctl->qc_cmd = Q_GETOQUOTA;
1531 GOTO(out_quotactl, rc = -EINVAL);
1533 if (sbi->ll_mdc_exp->exp_obd == obd) {
1534 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
/* Otherwise search the LOV targets for the matching OST export. */
1537 struct obd_export *exp;
1538 struct lov_obd *lov = &sbi->ll_osc_exp->
1541 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
1542 if (!lov->lov_tgts[i] ||
1543 !lov->lov_tgts[i]->ltd_active)
1545 exp = lov->lov_tgts[i]->ltd_exp;
1546 if (exp->exp_obd == obd) {
1547 rc = obd_quotactl(exp, oqctl);
1553 oqctl->qc_cmd = cmd;
1554 QCTL_COPY(qctl, oqctl);
1556 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1559 GOTO(out_quotactl, rc);
1562 rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
/* A half-applied QUOTAON is rolled back with QUOTAOFF. */
1563 if (rc && rc != -EBUSY && cmd == Q_QUOTAON) {
1564 oqctl->qc_cmd = Q_QUOTAOFF;
1565 obd_quotactl(sbi->ll_mdc_exp, oqctl);
1568 QCTL_COPY(qctl, oqctl);
1570 if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
1574 OBD_FREE_PTR(oqctl);
/* Return the OSC obd device name to userspace. */
1577 case OBD_IOC_GETNAME_OLD:
1578 case OBD_IOC_GETNAME: {
1579 struct obd_device *obd = class_exp2obd(sbi->ll_osc_exp);
1582 if (copy_to_user((void *)arg, obd->obd_name,
1583 strlen(obd->obd_name) + 1))
/* Return this directory's lu_fid. */
1587 case LL_IOC_PATH2FID: {
1588 if (copy_to_user((void *)arg, ll_inode_lu_fid(inode),
1589 sizeof(struct lu_fid)))
/* Default: forward unknown commands to the OSC export. */
1595 RETURN(obd_iocontrol(cmd, sbi->ll_osc_exp,0,NULL,(void *)arg));
/* File operations for Lustre directories; readdir dispatches between the
 * 1.8 and 2.0 protocol paths (see ll_readdir).
 * NOTE(review): closing brace of the initializer not in this excerpt. */
1599 struct file_operations ll_dir_operations = {
1600 .open = ll_file_open,
1601 .release = ll_file_release,
1602 .read = generic_read_dir,
1603 .readdir = ll_readdir,
1604 .ioctl = ll_dir_ioctl