Whamcloud - gitweb
LU-17276 ldlm: use interval tree for searching in flock
[fs/lustre-release.git] / lustre / llite / dir.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * lustre/llite/dir.c
32  *
33  * Directory code for lustre client.
34  */
35
36 #include <linux/fs.h>
37 #include <linux/pagemap.h>
38 #include <linux/mm.h>
39 #include <linux/version.h>
40 #include <linux/security.h>
41 #include <linux/user_namespace.h>
42 #include <linux/uidgid.h>
43 #include <linux/uaccess.h>
44 #include <linux/buffer_head.h>   // for wait_on_buffer
45 #include <linux/pagevec.h>
46
47 #define DEBUG_SUBSYSTEM S_LLITE
48
49 #include <obd_support.h>
50 #include <obd_class.h>
51 #include <uapi/linux/lustre/lustre_ioctl.h>
52 #include <lustre_lib.h>
53 #include <lustre_dlm.h>
54 #include <lustre_compat.h>
55 #include <lustre_fid.h>
56 #include <lustre_kernelcomm.h>
57 #include <lustre_swab.h>
58 #include <lustre_quota.h>
59
60 #include "llite_internal.h"
61
62 /*
63  * (new) readdir implementation overview.
64  *
65  * Original lustre readdir implementation cached exact copy of raw directory
66  * pages on the client. These pages were indexed in client page cache by
67  * logical offset in the directory file. This design, while very simple and
68  * intuitive had some inherent problems:
69  *
70  *     . it implies that byte offset to the directory entry serves as a
71  *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
72  *     ext3/htree directory entries may move due to splits, and more
73  *     importantly,
74  *
75  *     . it is incompatible with the design of split directories for cmd3,
76  *     that assumes that names are distributed across nodes based on their
77  *     hash, and so readdir should be done in hash order.
78  *
79  * New readdir implementation does readdir in hash order, and uses hash of a
80  * file name as a telldir/seekdir cookie. This led to number of complications:
81  *
82  *     . hash is not unique, so it cannot be used to index cached directory
83  *     pages on the client (note, that it requires a whole pageful of hash
84  *     collided entries to cause two pages to have identical hashes);
85  *
86  *     . hash is not unique, so it cannot, strictly speaking, be used as an
87  *     entry cookie. ext3/htree has the same problem and lustre implementation
88  *     mimics their solution: seekdir(hash) positions directory at the first
89  *     entry with the given hash.
90  *
91  * Client side.
92  *
93  * 0. caching
94  *
95  * Client caches directory pages using hash of the first entry as an index. As
96  * noted above hash is not unique, so this solution doesn't work as is:
97  * special processing is needed for "page hash chains" (i.e., sequences of
98  * pages filled with entries all having the same hash value).
99  *
100  * First, such chains have to be detected. To this end, server returns to the
101  * client the hash of the first entry on the page next to one returned. When
102  * client detects that this hash is the same as hash of the first entry on the
103  * returned page, page hash collision has to be handled. Pages in the
104  * hash chain, except first one, are termed "overflow pages".
105  *
106  * Proposed (unimplemented) solution to index uniqueness problem is to
107  * not cache overflow pages.  Instead, when page hash collision is
108  * detected, all overflow pages from emerging chain should be
109  * immediately requested from the server and placed in a special data
110  * structure.  This data structure can be used by ll_readdir() to
111  * process entries from overflow pages.  When readdir invocation
112  * finishes, overflow pages are discarded.  If page hash collision chain
113  * weren't completely processed, next call to readdir will again detect
114  * page hash collision, again read overflow pages in, process next
115  * portion of entries and again discard the pages.  This is not as
116  * wasteful as it looks, because, given reasonable hash, page hash
117  * collisions are extremely rare.
118  *
119  * 1. directory positioning
120  *
121  * When seekdir(hash) is called.
122  *
123  * seekdir() sets the location in the directory stream from which the next
124  * readdir() call will start. mdc_page_locate() is used to find page with
125  * starting hash and will issue RPC to fetch that page. If there is a hash
126  * collision the concerned page is removed.
127  *
128  *
129  * Server.
130  *
131  * identification of and access to overflow pages
132  *
133  * page format
134  *
135  * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains
136  * a header lu_dirpage which describes the start/end hash, and whether this
137  * page is empty (contains no dir entry) or hash collide with next page.
138  * After client receives reply, several pages will be integrated into dir page
139  * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the
140  * lu_dirpage for this integrated page will be adjusted. See
141  * mdc_adjust_dirpages().
142  *
143  */
144 struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data,
145                              __u64 offset, int *partial_readdir_rc)
146 {
147         struct md_readdir_info mrinfo = {
148                                         .mr_blocking_ast = ll_md_blocking_ast };
149         struct page *page;
150         int rc;
151
152         rc = md_read_page(ll_i2mdexp(dir), op_data, &mrinfo, offset, &page);
153         if (rc != 0)
154                 return ERR_PTR(rc);
155
156         if (partial_readdir_rc && mrinfo.mr_partial_readdir_rc)
157                 *partial_readdir_rc = mrinfo.mr_partial_readdir_rc;
158
159         return page;
160 }
161
162 void ll_release_page(struct inode *inode, struct page *page,
163                      bool remove)
164 {
165         kunmap(page);
166
167         /* Always remove the page for striped dir, because the page is
168          * built from temporarily in LMV layer
169          */
170         if (inode && ll_dir_striped(inode)) {
171                 __free_page(page);
172                 return;
173         }
174
175         if (remove) {
176                 lock_page(page);
177                 if (likely(page->mapping != NULL))
178                         cfs_delete_from_page_cache(page);
179                 unlock_page(page);
180         }
181         put_page(page);
182 }
183
184 #ifdef HAVE_DIR_CONTEXT
185 int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data,
186                 struct dir_context *ctx, int *partial_readdir_rc)
187 {
188 #else
189 int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data,
190                 void *cookie, filldir_t filldir, int *partial_readdir_rc)
191 {
192 #endif
193         struct ll_sb_info *sbi = ll_i2sbi(inode);
194         __u64 pos = *ppos;
195         bool is_api32 = ll_need_32bit_api(sbi);
196         bool is_hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags);
197         struct page *page;
198         bool done = false;
199         struct llcrypt_str lltr = LLTR_INIT(NULL, 0);
200         int rc = 0;
201
202         ENTRY;
203
204         if (IS_ENCRYPTED(inode)) {
205                 rc = llcrypt_fname_alloc_buffer(inode, NAME_MAX, &lltr);
206                 if (rc < 0)
207                         RETURN(rc);
208         }
209
210         page = ll_get_dir_page(inode, op_data, pos, partial_readdir_rc);
211
212         while (rc == 0 && !done) {
213                 struct lu_dirpage *dp;
214                 struct lu_dirent  *ent;
215                 __u64 hash;
216                 __u64 next;
217
218                 if (IS_ERR(page)) {
219                         rc = PTR_ERR(page);
220                         break;
221                 }
222
223                 hash = MDS_DIR_END_OFF;
224                 dp = page_address(page);
225                 for (ent = lu_dirent_start(dp); ent != NULL && !done;
226                      ent = lu_dirent_next(ent)) {
227                         __u16          type;
228                         int            namelen;
229                         struct lu_fid  fid;
230                         __u64          lhash;
231                         __u64          ino;
232
233                         hash = le64_to_cpu(ent->lde_hash);
234                         if (hash < pos) /* Skip until we find target hash */
235                                 continue;
236
237                         namelen = le16_to_cpu(ent->lde_namelen);
238                         if (namelen == 0) /* Skip dummy record */
239                                 continue;
240
241                         if (is_api32 && is_hash64)
242                                 lhash = hash >> 32;
243                         else
244                                 lhash = hash;
245                         fid_le_to_cpu(&fid, &ent->lde_fid);
246                         ino = cl_fid_build_ino(&fid, is_api32);
247                         type = S_DT(lu_dirent_type_get(ent));
248                         /* For ll_nfs_get_name_filldir(), it will try to access
249                          * 'ent' through 'lde_name', so the parameter 'name'
250                          * for 'filldir()' must be part of the 'ent'.
251                          */
252 #ifdef HAVE_DIR_CONTEXT
253                         ctx->pos = lhash;
254                         if (!IS_ENCRYPTED(inode)) {
255                                 done = !dir_emit(ctx, ent->lde_name, namelen,
256                                                  ino, type);
257                         } else {
258                                 /* Directory is encrypted */
259                                 int save_len = lltr.len;
260                                 struct llcrypt_str de_name =
261                                         LLTR_INIT(ent->lde_name, namelen);
262
263                                 rc = ll_fname_disk_to_usr(inode, 0, 0, &de_name,
264                                                           &lltr, &fid);
265                                 de_name = lltr;
266                                 lltr.len = save_len;
267                                 if (rc) {
268                                         done = 1;
269                                         break;
270                                 }
271                                 done = !dir_emit(ctx, de_name.name, de_name.len,
272                                                  ino, type);
273                         }
274 #else
275                         /* HAVE_DIR_CONTEXT is defined from kernel 3.11, whereas
276                          * IS_ENCRYPTED is brought by kernel 4.14.
277                          * So there is no need to handle encryption case here.
278                          */
279                         done = filldir(cookie, ent->lde_name, namelen, lhash,
280                                        ino, type);
281 #endif
282                 }
283
284                 if (done) {
285                         pos = hash;
286                         ll_release_page(inode, page, false);
287                         break;
288                 }
289
290                 next = le64_to_cpu(dp->ldp_hash_end);
291                 pos = next;
292                 if (pos == MDS_DIR_END_OFF) {
293                         /* End of directory reached. */
294                         done = 1;
295                         ll_release_page(inode, page, false);
296                 } else {
297                         /* Normal case: continue to the next page.*/
298                         ll_release_page(inode, page,
299                                         le32_to_cpu(dp->ldp_flags) &
300                                         LDF_COLLIDE);
301                         next = pos;
302                         page = ll_get_dir_page(inode, op_data, pos,
303                                                partial_readdir_rc);
304                 }
305         }
306 #ifdef HAVE_DIR_CONTEXT
307         ctx->pos = pos;
308 #else
309         *ppos = pos;
310 #endif
311         llcrypt_fname_free_buffer(&lltr);
312         RETURN(rc);
313 }
314
315 #ifdef HAVE_DIR_CONTEXT
316 static int ll_iterate(struct file *filp, struct dir_context *ctx)
317 #else
318 static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
319 #endif
320 {
321         struct inode *inode = file_inode(filp);
322         struct ll_file_data *lfd = filp->private_data;
323         struct ll_sb_info *sbi = ll_i2sbi(inode);
324         bool hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags);
325         int api32 = ll_need_32bit_api(sbi);
326         struct md_op_data *op_data;
327         struct lu_fid pfid = { 0 };
328         ktime_t kstart = ktime_get();
329         /* result of possible partial readdir */
330         int partial_readdir_rc = 0;
331         __u64 pos;
332         int rc;
333
334         ENTRY;
335
336         LASSERT(lfd != NULL);
337         pos = lfd->lfd_pos;
338
339         CDEBUG(D_VFSTRACE,
340                "VFS Op:inode="DFID"(%p) pos/size%lu/%llu 32bit_api %d\n",
341                PFID(ll_inode2fid(inode)),
342                inode, (unsigned long)pos, i_size_read(inode), api32);
343
344         if (IS_ENCRYPTED(inode)) {
345                 rc = llcrypt_prepare_readdir(inode);
346                 if (rc && rc != -ENOKEY)
347                         GOTO(out, rc);
348         }
349
350         if (pos == MDS_DIR_END_OFF)
351                 /* end-of-file. */
352                 GOTO(out, rc = 0);
353
354         if (unlikely(ll_dir_striped(inode))) {
355                 struct dentry *parent = dget_parent(file_dentry(filp));
356                 struct inode *i_dir = d_inode(parent);
357
358                 /* Only needed for striped dir to fill ..see lmv_read_page() */
359                 if (i_dir) {
360                         struct obd_export *exp = ll_i2mdexp(i_dir);
361                         __u64 ibits = MDS_INODELOCK_LOOKUP;
362
363                         if (ll_have_md_lock(exp, i_dir, &ibits, LCK_MINMODE))
364                                 pfid = *ll_inode2fid(i_dir);
365                 }
366                 dput(parent);
367
368                 /* If it can not find in cache, do lookup on the master obj */
369                 if (fid_is_zero(&pfid)) {
370                         rc = ll_dir_get_parent_fid(inode, &pfid);
371                         if (rc != 0)
372                                 RETURN(rc);
373                 }
374         }
375
376         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
377                                      LUSTRE_OPC_ANY, inode);
378         if (IS_ERR(op_data))
379                 GOTO(out, rc = PTR_ERR(op_data));
380
381         /* foreign dirs are browsed out of Lustre */
382         if (unlikely(lmv_dir_foreign(op_data->op_lso1))) {
383                 ll_finish_md_op_data(op_data);
384                 RETURN(-ENODATA);
385         }
386
387         op_data->op_fid3 = pfid;
388
389 #ifdef HAVE_DIR_CONTEXT
390         ctx->pos = pos;
391         rc = ll_dir_read(inode, &pos, op_data, ctx, &partial_readdir_rc);
392         pos = ctx->pos;
393 #else
394         rc = ll_dir_read(inode, &pos, op_data, cookie, filldir,
395                          &partial_readdir_rc);
396 #endif
397         lfd->lfd_pos = pos;
398         if (!lfd->fd_partial_readdir_rc)
399                 lfd->fd_partial_readdir_rc = partial_readdir_rc;
400
401         if (pos == MDS_DIR_END_OFF) {
402                 if (api32)
403                         pos = LL_DIR_END_OFF_32BIT;
404                 else
405                         pos = LL_DIR_END_OFF;
406         } else {
407                 if (api32 && hash64)
408                         pos = pos >> 32;
409         }
410 #ifdef HAVE_DIR_CONTEXT
411         ctx->pos = pos;
412 #else
413         filp->f_pos = pos;
414 #endif
415         ll_finish_md_op_data(op_data);
416
417 out:
418         if (!rc)
419                 ll_stats_ops_tally(sbi, LPROC_LL_READDIR,
420                                    ktime_us_delta(ktime_get(), kstart));
421
422         RETURN(rc);
423 }
424
425 /*
426  * Create striped directory with specified stripe(@lump)
427  *
428  * \param[in] dparent   the parent of the directory.
429  * \param[in] lump      the specified stripes.
430  * \param[in] dirname   the name of the directory.
431  * \param[in] mode      the specified mode of the directory.
432  *
433  * \retval              =0 if striped directory is being created successfully.
434  *                      <0 if the creation is failed.
435  */
436 static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump,
437                                size_t len, const char *dirname, umode_t mode,
438                                bool createonly)
439 {
440         struct inode *parent = dparent->d_inode;
441         struct ptlrpc_request *request = NULL;
442         struct md_op_data *op_data;
443         struct ll_sb_info *sbi = ll_i2sbi(parent);
444         struct inode *inode = NULL;
445         struct dentry dentry = {
446                 .d_parent = dparent,
447                 .d_name = {
448                         .name = dirname,
449                         .len = strlen(dirname),
450                         .hash = ll_full_name_hash(dparent, dirname,
451                                                   strlen(dirname)),
452                 },
453                 .d_sb = dparent->d_sb,
454         };
455         bool encrypt = false;
456         int hash_flags;
457         int err;
458
459         ENTRY;
460         if (unlikely(!lmv_user_magic_supported(lump->lum_magic)))
461                 RETURN(-EINVAL);
462
463         if (lump->lum_magic != LMV_MAGIC_FOREIGN) {
464                 CDEBUG(D_VFSTRACE,
465                        "VFS Op:inode="DFID"(%p) name=%s stripe_offset=%d stripe_count=%u, hash_type=%x\n",
466                        PFID(ll_inode2fid(parent)), parent, dirname,
467                        (int)lump->lum_stripe_offset, lump->lum_stripe_count,
468                        lump->lum_hash_type);
469         } else {
470                 struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lump;
471
472                 CDEBUG(D_VFSTRACE,
473                        "VFS Op:inode="DFID"(%p) name %s foreign, length %u, value '%.*s'\n",
474                        PFID(ll_inode2fid(parent)), parent, dirname,
475                        lfm->lfm_length, lfm->lfm_length, lfm->lfm_value);
476         }
477
478         if (lump->lum_stripe_count > 1 &&
479             !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE))
480                 RETURN(-EINVAL);
481
482         if (IS_DEADDIR(parent) &&
483             !CFS_FAIL_CHECK(OBD_FAIL_LLITE_NO_CHECK_DEAD))
484                 RETURN(-ENOENT);
485
486         /* MDS < 2.14 doesn't support 'crush' hash type, and cannot handle
487          * unknown hash if client doesn't set a valid one. switch to fnv_1a_64.
488          */
489         if (CFS_FAIL_CHECK(OBD_FAIL_LMV_UNKNOWN_STRIPE)) {
490                 lump->lum_hash_type = cfs_fail_val;
491         } else if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) {
492                 enum lmv_hash_type type = lump->lum_hash_type &
493                                           LMV_HASH_TYPE_MASK;
494
495                 if (type >= LMV_HASH_TYPE_CRUSH ||
496                     type == LMV_HASH_TYPE_UNKNOWN)
497                         lump->lum_hash_type = (lump->lum_hash_type ^ type) |
498                                               LMV_HASH_TYPE_FNV_1A_64;
499         }
500
501         hash_flags = lump->lum_hash_type & ~LMV_HASH_TYPE_MASK;
502         if (hash_flags & ~LMV_HASH_FLAG_KNOWN)
503                 RETURN(-EINVAL);
504
505         if (unlikely(!lmv_user_magic_supported(cpu_to_le32(lump->lum_magic))))
506                 lustre_swab_lmv_user_md(lump);
507
508         if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
509                 mode &= ~current_umask();
510         mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
511         op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname,
512                                      strlen(dirname), mode, LUSTRE_OPC_MKDIR,
513                                      lump);
514         if (IS_ERR(op_data))
515                 RETURN(PTR_ERR(op_data));
516
517         op_data->op_dir_depth = ll_i2info(parent)->lli_inherit_depth ?:
518                                 ll_i2info(parent)->lli_dir_depth;
519
520         if (ll_sbi_has_encrypt(sbi) &&
521             (IS_ENCRYPTED(parent) ||
522              unlikely(ll_sb_has_test_dummy_encryption(parent->i_sb)))) {
523                 err = llcrypt_prepare_readdir(parent);
524                 if (err)
525                         GOTO(out_op_data, err);
526                 if (!llcrypt_has_encryption_key(parent))
527                         GOTO(out_op_data, err = -ENOKEY);
528                 encrypt = true;
529         }
530
531         if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags)) {
532                 /* selinux_dentry_init_security() uses dentry->d_parent and name
533                  * to determine the security context for the file. So our fake
534                  * dentry should be real enough for this purpose.
535                  */
536                 err = ll_dentry_init_security(&dentry, mode, &dentry.d_name,
537                                               &op_data->op_file_secctx_name,
538                                               &op_data->op_file_secctx_name_size,
539                                               &op_data->op_file_secctx,
540                                               &op_data->op_file_secctx_size,
541                                               &op_data->op_file_secctx_slot);
542                 if (err < 0)
543                         GOTO(out_op_data, err);
544         }
545
546         if (encrypt) {
547                 err = llcrypt_inherit_context(parent, NULL, op_data, false);
548                 if (err)
549                         GOTO(out_op_data, err);
550         }
551
552         op_data->op_cli_flags |= CLI_SET_MEA;
553         if (createonly)
554                 op_data->op_bias |= MDS_SETSTRIPE_CREATE;
555
556         err = md_create(sbi->ll_md_exp, op_data, lump, len, mode,
557                         from_kuid(&init_user_ns, current_fsuid()),
558                         from_kgid(&init_user_ns, current_fsgid()),
559                         current_cap(), 0, &request);
560         if (err)
561                 GOTO(out_request, err);
562
563         CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_SETDIRSTRIPE_PAUSE, cfs_fail_val);
564
565         err = ll_prep_inode(&inode, &request->rq_pill, parent->i_sb, NULL);
566         if (err)
567                 GOTO(out_inode, err);
568
569         dentry.d_inode = inode;
570
571         if (test_bit(LL_SBI_FILE_SECCTX, sbi->ll_flags))
572                 err = ll_inode_notifysecctx(inode, op_data->op_file_secctx,
573                                             op_data->op_file_secctx_size);
574         else
575                 err = ll_inode_init_security(&dentry, inode, parent);
576
577         if (err)
578                 GOTO(out_inode, err);
579
580         if (encrypt) {
581                 err = ll_set_encflags(inode, op_data->op_file_encctx,
582                                       op_data->op_file_encctx_size, false);
583                 if (err)
584                         GOTO(out_inode, err);
585         }
586
587 out_inode:
588         iput(inode);
589 out_request:
590         ptlrpc_req_finished(request);
591 out_op_data:
592         ll_finish_md_op_data(op_data);
593
594         return err;
595 }
596
597 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
598                      int set_default)
599 {
600         struct ll_sb_info *sbi = ll_i2sbi(inode);
601         struct md_op_data *op_data;
602         struct ptlrpc_request *req = NULL;
603         int rc = 0;
604         int lum_size;
605
606         ENTRY;
607         if (lump != NULL) {
608                 switch (lump->lmm_magic) {
609                 case LOV_USER_MAGIC_V1:
610                         lum_size = sizeof(struct lov_user_md_v1);
611                         break;
612                 case LOV_USER_MAGIC_V3:
613                         lum_size = sizeof(struct lov_user_md_v3);
614                         break;
615                 case LOV_USER_MAGIC_COMP_V1:
616                         lum_size = ((struct lov_comp_md_v1 *)lump)->lcm_size;
617                         break;
618                 case LMV_USER_MAGIC: {
619                         struct lmv_user_md *lmv = (struct lmv_user_md *)lump;
620
621                         /* MDS < 2.14 doesn't support 'crush' hash type, and
622                          * cannot handle unknown hash if client doesn't set a
623                          * valid one. switch to fnv_1a_64.
624                          */
625                         if (!(exp_connect_flags2(sbi->ll_md_exp) &
626                               OBD_CONNECT2_CRUSH)) {
627                                 enum lmv_hash_type type = lmv->lum_hash_type &
628                                                           LMV_HASH_TYPE_MASK;
629
630                                 if (type >= LMV_HASH_TYPE_CRUSH ||
631                                     type == LMV_HASH_TYPE_UNKNOWN)
632                                         lmv->lum_hash_type =
633                                                 (lmv->lum_hash_type ^ type) |
634                                                 LMV_HASH_TYPE_FNV_1A_64;
635                         }
636                         if (lmv->lum_magic != cpu_to_le32(LMV_USER_MAGIC))
637                                 lustre_swab_lmv_user_md(lmv);
638                         lum_size = sizeof(*lmv);
639                         break;
640                 }
641                 case LOV_USER_MAGIC_SPECIFIC: {
642                         struct lov_user_md_v3 *v3 =
643                                 (struct lov_user_md_v3 *)lump;
644                         if (v3->lmm_stripe_count > LOV_MAX_STRIPE_COUNT)
645                                 RETURN(-EINVAL);
646                         lum_size = lov_user_md_size(v3->lmm_stripe_count,
647                                                     LOV_USER_MAGIC_SPECIFIC);
648                         break;
649                 }
650                 default:
651                         CDEBUG(D_IOCTL,
652                                "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
653                                lump->lmm_magic, LOV_USER_MAGIC_V1,
654                                LOV_USER_MAGIC_V3);
655                         RETURN(-EINVAL);
656                 }
657
658                 /* This is coming from userspace, so should be in
659                  * local endian.  But the MDS would like it in little
660                  * endian, so we swab it before we send it.
661                  */
662                 if ((__swab32(lump->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
663                     le32_to_cpu(LOV_MAGIC_MAGIC))
664                         lustre_swab_lov_user_md(lump, 0);
665         } else {
666                 lum_size = sizeof(struct lov_user_md_v1);
667         }
668
669         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
670                                      LUSTRE_OPC_ANY, NULL);
671         if (IS_ERR(op_data))
672                 RETURN(PTR_ERR(op_data));
673
674         /* swabbing is done in lov_setstripe() on server side */
675         rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req);
676         ll_finish_md_op_data(op_data);
677         ptlrpc_req_finished(req);
678         if (rc)
679                 RETURN(rc);
680
681         RETURN(rc);
682 }
683
684 /* get default LMV from client cache */
685 static int ll_dir_get_default_lmv(struct inode *inode, struct lmv_user_md *lum)
686 {
687         struct ll_inode_info *lli = ll_i2info(inode);
688         const struct lmv_stripe_md *lsm;
689         bool fs_dmv_got = false;
690         int rc = -ENODATA;
691
692         ENTRY;
693 retry:
694         if (lli->lli_def_lsm_obj) {
695                 down_read(&lli->lli_lsm_sem);
696                 lsm = &lli->lli_def_lsm_obj->lso_lsm;
697                 if (lsm) {
698                         lum->lum_magic = lsm->lsm_md_magic;
699                         lum->lum_stripe_count = lsm->lsm_md_stripe_count;
700                         lum->lum_stripe_offset = lsm->lsm_md_master_mdt_index;
701                         lum->lum_hash_type = lsm->lsm_md_hash_type;
702                         lum->lum_max_inherit = lsm->lsm_md_max_inherit;
703                         lum->lum_max_inherit_rr = lsm->lsm_md_max_inherit_rr;
704                         rc = 0;
705                 }
706                 up_read(&lli->lli_lsm_sem);
707         }
708
709         if (rc == -ENODATA && !is_root_inode(inode) && !fs_dmv_got) {
710                 lli = ll_i2info(inode->i_sb->s_root->d_inode);
711                 fs_dmv_got = true;
712                 goto retry;
713         }
714
715         if (!rc && fs_dmv_got) {
716                 lli = ll_i2info(inode);
717                 if (lum->lum_max_inherit != LMV_INHERIT_UNLIMITED) {
718                         if (lum->lum_max_inherit == LMV_INHERIT_NONE ||
719                             lum->lum_max_inherit < LMV_INHERIT_END ||
720                             lum->lum_max_inherit > LMV_INHERIT_MAX ||
721                             lum->lum_max_inherit <= lli->lli_dir_depth)
722                                 GOTO(out, rc = -ENODATA);
723
724                         lum->lum_max_inherit -= lli->lli_dir_depth;
725                 }
726
727                 if (lum->lum_max_inherit_rr != LMV_INHERIT_RR_UNLIMITED) {
728                         if (lum->lum_max_inherit_rr == LMV_INHERIT_NONE ||
729                             lum->lum_max_inherit_rr < LMV_INHERIT_RR_END ||
730                             lum->lum_max_inherit_rr > LMV_INHERIT_RR_MAX ||
731                             lum->lum_max_inherit_rr <= lli->lli_dir_depth)
732                                 lum->lum_max_inherit_rr = LMV_INHERIT_RR_NONE;
733
734                         if (lum->lum_max_inherit_rr > lli->lli_dir_depth)
735                                 lum->lum_max_inherit_rr -= lli->lli_dir_depth;
736                 }
737         }
738 out:
739         RETURN(rc);
740 }
741
742 int ll_dir_get_default_layout(struct inode *inode, void **plmm, int *plmm_size,
743                               struct ptlrpc_request **request, u64 valid,
744                               enum get_default_layout_type type)
745 {
746         struct ll_sb_info *sbi = ll_i2sbi(inode);
747         struct mdt_body   *body;
748         struct lov_mds_md *lmm = NULL;
749         struct ptlrpc_request *req = NULL;
750         int lmm_size = OBD_MAX_DEFAULT_EA_SIZE;
751         struct md_op_data *op_data;
752         struct lu_fid fid;
753         int rc;
754
755         ENTRY;
756
757         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, lmm_size,
758                                      LUSTRE_OPC_ANY, NULL);
759         if (IS_ERR(op_data))
760                 RETURN(PTR_ERR(op_data));
761
762         op_data->op_valid = valid | OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
763
764         if (type == GET_DEFAULT_LAYOUT_ROOT) {
765                 lu_root_fid(&op_data->op_fid1);
766                 fid = op_data->op_fid1;
767         } else {
768                 fid = *ll_inode2fid(inode);
769         }
770
771         rc = md_getattr(sbi->ll_md_exp, op_data, &req);
772         ll_finish_md_op_data(op_data);
773         if (rc < 0) {
774                 CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n",
775                        PFID(&fid), rc);
776                 GOTO(out, rc);
777         }
778
779         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
780         LASSERT(body != NULL);
781
782         lmm_size = body->mbo_eadatasize;
783
784         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
785             lmm_size == 0) {
786                 GOTO(out, rc = -ENODATA);
787         }
788
789         lmm = req_capsule_server_sized_get(&req->rq_pill,
790                                            &RMF_MDT_MD, lmm_size);
791         LASSERT(lmm != NULL);
792
793         /* This is coming from the MDS, so is probably in
794          * little endian.  We convert it to host endian before
795          * passing it to userspace.
796          */
797         /* We don't swab objects for directories */
798         switch (le32_to_cpu(lmm->lmm_magic)) {
799         case LOV_MAGIC_V1:
800         case LOV_MAGIC_V3:
801         case LOV_MAGIC_COMP_V1:
802         case LOV_USER_MAGIC_SPECIFIC:
803                 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
804                         lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
805                 break;
806         case LMV_MAGIC_V1:
807                 if (LMV_MAGIC != cpu_to_le32(LMV_MAGIC))
808                         lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm);
809                 break;
810         case LMV_USER_MAGIC:
811                 if (LMV_USER_MAGIC != cpu_to_le32(LMV_USER_MAGIC))
812                         lustre_swab_lmv_user_md((struct lmv_user_md *)lmm);
813                 break;
814         case LMV_MAGIC_FOREIGN: {
815                 struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lmm;
816
817                 if (LMV_MAGIC_FOREIGN != cpu_to_le32(LMV_MAGIC_FOREIGN)) {
818                         __swab32s(&lfm->lfm_magic);
819                         __swab32s(&lfm->lfm_length);
820                         __swab32s(&lfm->lfm_type);
821                         __swab32s(&lfm->lfm_flags);
822                 }
823                 break;
824         }
825         default:
826                 rc = -EPROTO;
827                 CERROR("%s: unknown magic: %lX: rc = %d\n", sbi->ll_fsname,
828                        (unsigned long)lmm->lmm_magic, rc);
829         }
830 out:
831         *plmm = lmm;
832         *plmm_size = lmm_size;
833         *request = req;
834         return rc;
835 }
836
837 /*
838  * This function will be used to get default LOV/LMV/Default LMV
839  * @valid will be used to indicate which stripe it will retrieve.
840  * If the directory does not have its own default layout, then the
841  * function will request the default layout from root FID.
842  *      OBD_MD_MEA              LMV stripe EA
843  *      OBD_MD_DEFAULT_MEA      Default LMV stripe EA
844  *      otherwise               Default LOV EA.
845  * Each time, it can only retrieve 1 stripe EA
846  **/
847 int ll_dir_getstripe_default(struct inode *inode, void **plmm, int *plmm_size,
848                              struct ptlrpc_request **request,
849                              struct ptlrpc_request **root_request,
850                              u64 valid)
851 {
852         struct ptlrpc_request *req = NULL;
853         struct ptlrpc_request *root_req = NULL;
854         struct lov_mds_md *lmm = NULL;
855         int lmm_size = 0;
856         int rc = 0;
857
858         ENTRY;
859         rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size,
860                                        &req, valid, 0);
861         if (rc == -ENODATA && !fid_is_root(ll_inode2fid(inode)) &&
862             !(valid & OBD_MD_MEA) && root_request != NULL) {
863                 int rc2 = ll_dir_get_default_layout(inode, (void **)&lmm,
864                                                     &lmm_size, &root_req, valid,
865                                                     GET_DEFAULT_LAYOUT_ROOT);
866                 if (rc2 == 0)
867                         rc = 0;
868         }
869
870         *plmm = lmm;
871         *plmm_size = lmm_size;
872         *request = req;
873         if (root_request != NULL)
874                 *root_request = root_req;
875
876         RETURN(rc);
877 }
878
879 /*
880  * This function will be used to get default LOV/LMV/Default LMV
881  * @valid will be used to indicate which stripe it will retrieve
882  *      OBD_MD_MEA              LMV stripe EA
883  *      OBD_MD_DEFAULT_MEA      Default LMV stripe EA
884  *      otherwise               Default LOV EA.
885  * Each time, it can only retrieve 1 stripe EA
886  **/
887 int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size,
888                      struct ptlrpc_request **request, u64 valid)
889 {
890         struct ptlrpc_request *req = NULL;
891         struct lov_mds_md *lmm = NULL;
892         int lmm_size = 0;
893         int rc = 0;
894
895         ENTRY;
896         rc = ll_dir_get_default_layout(inode, (void **)&lmm, &lmm_size,
897                                        &req, valid, 0);
898
899         *plmm = lmm;
900         *plmm_size = lmm_size;
901         *request = req;
902
903         RETURN(rc);
904 }
905
906 int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid)
907 {
908         struct md_op_data *op_data;
909         int rc;
910         int mdt_index;
911
912         ENTRY;
913         OBD_ALLOC_PTR(op_data);
914         if (op_data == NULL)
915                 RETURN(-ENOMEM);
916
917         op_data->op_flags |= MF_GET_MDT_IDX;
918         op_data->op_fid1 = *fid;
919         rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
920         mdt_index = op_data->op_mds;
921         OBD_FREE_PTR(op_data);
922         if (rc < 0)
923                 RETURN(rc);
924
925         RETURN(mdt_index);
926 }
927
/*
 * Get the MDT index for an inode: a convenience wrapper that looks up the
 * inode's superblock info and FID and hands them to
 * ll_get_mdt_idx_by_fid().
 */
int ll_get_mdt_idx(struct inode *inode)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	const struct lu_fid *fid = ll_inode2fid(inode);

	return ll_get_mdt_idx_by_fid(sbi, fid);
}
935
936 /*
937  * Generic handler to do any pre-copy work.
938  *
939  * It sends a first hsm_progress (with extent length == 0) to coordinator as a
940  * first information for it that real work has started.
941  *
942  * Moreover, for a ARCHIVE request, it will sample the file data version and
943  * store it in \a copy.
944  *
945  * \return 0 on success.
946  */
947 static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
948 {
949         struct ll_sb_info *sbi = ll_s2sbi(sb);
950         struct hsm_progress_kernel hpk;
951         int rc = 0;
952         int rc2;
953
954         ENTRY;
955         /* Forge a hsm_progress based on data from copy. */
956         hpk.hpk_fid = copy->hc_hai.hai_fid;
957         hpk.hpk_cookie = copy->hc_hai.hai_cookie;
958         hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
959         hpk.hpk_extent.length = 0;
960         hpk.hpk_flags = 0;
961         hpk.hpk_errval = 0;
962         hpk.hpk_data_version = 0;
963
964
965         /* For archive request, we need to read the current file version. */
966         if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
967                 struct inode    *inode;
968                 __u64            data_version = 0;
969
970                 /* Get inode for this fid */
971                 inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
972                 if (IS_ERR(inode)) {
973                         hpk.hpk_flags |= HP_FLAG_RETRY;
974                         /* hpk_errval is >= 0 */
975                         hpk.hpk_errval = -PTR_ERR(inode);
976                         GOTO(progress, rc = PTR_ERR(inode));
977                 }
978
979                 /* Read current file data version */
980                 rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH);
981                 iput(inode);
982                 if (rc != 0) {
983                         CDEBUG(D_HSM, "Could not read file data version of "
984                                       DFID" (rc = %d). Archive request ("
985                                       "%#llx) could not be done.\n",
986                                       PFID(&copy->hc_hai.hai_fid), rc,
987                                       copy->hc_hai.hai_cookie);
988                         hpk.hpk_flags |= HP_FLAG_RETRY;
989                         /* hpk_errval must be >= 0 */
990                         hpk.hpk_errval = -rc;
991                         GOTO(progress, rc);
992                 }
993
994                 /* Store in the hsm_copy for later copytool use.
995                  * Always modified even if no lsm.
996                  */
997                 copy->hc_data_version = data_version;
998         }
999
1000 progress:
1001         /* On error, the request should be considered as completed */
1002         if (hpk.hpk_errval > 0)
1003                 hpk.hpk_flags |= HP_FLAG_COMPLETED;
1004
1005         rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
1006                             &hpk, NULL);
1007
1008         /* Return first error */
1009         RETURN(rc != 0 ? rc : rc2);
1010 }
1011
1012 /*
1013  * Generic handler to do any post-copy work.
1014  *
1015  * It will send the last hsm_progress update to coordinator to inform it
1016  * that copy is finished and whether it was successful or not.
1017  *
1018  * Moreover,
1019  * - for ARCHIVE request, it will sample the file data version and compare it
1020  *   with the version saved in ll_ioc_copy_start(). If they do not match, copy
1021  *   will be considered as failed.
1022  * - for RESTORE request, it will sample the file data version and send it to
1023  *   coordinator which is useful if the file was imported as 'released'.
1024  *
1025  * \return 0 on success.
1026  */
1027 static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
1028 {
1029         struct ll_sb_info *sbi = ll_s2sbi(sb);
1030         struct hsm_progress_kernel hpk;
1031         int rc = 0;
1032         int rc2;
1033
1034         ENTRY;
1035         /* If you modify the logic here, also check llapi_hsm_copy_end(). */
1036         /* Take care: copy->hc_hai.hai_action, len, gid and data are not
1037          * initialized if copy_end was called with copy == NULL.
1038          */
1039
1040         /* Forge a hsm_progress based on data from copy. */
1041         hpk.hpk_fid = copy->hc_hai.hai_fid;
1042         hpk.hpk_cookie = copy->hc_hai.hai_cookie;
1043         hpk.hpk_extent = copy->hc_hai.hai_extent;
1044         hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
1045         hpk.hpk_errval = copy->hc_errval;
1046         hpk.hpk_data_version = 0;
1047
1048         /* For archive request, we need to check the file data was not changed.
1049          *
1050          * For restore request, we need to send the file data version, this is
1051          * useful when the file was created using hsm_import.
1052          */
1053         if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
1054              (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
1055             (copy->hc_errval == 0)) {
1056                 struct inode    *inode;
1057                 __u64            data_version = 0;
1058
1059                 /* Get lsm for this fid */
1060                 inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
1061                 if (IS_ERR(inode)) {
1062                         hpk.hpk_flags |= HP_FLAG_RETRY;
1063                         /* hpk_errval must be >= 0 */
1064                         hpk.hpk_errval = -PTR_ERR(inode);
1065                         GOTO(progress, rc = PTR_ERR(inode));
1066                 }
1067
1068                 rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH);
1069                 iput(inode);
1070                 if (rc) {
1071                         CDEBUG(D_HSM,
1072                                "Could not read file data version. Request could not be confirmed.\n");
1073                         if (hpk.hpk_errval == 0)
1074                                 hpk.hpk_errval = -rc;
1075                         GOTO(progress, rc);
1076                 }
1077
1078                 /* Store in the hsm_copy for later copytool use.
1079                  * Always modified even if no lsm.
1080                  */
1081                 hpk.hpk_data_version = data_version;
1082
1083                 /* File could have been stripped during archiving, so we need
1084                  * to check anyway.
1085                  */
1086                 if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
1087                     (copy->hc_data_version != data_version)) {
1088                         CDEBUG(D_HSM, "File data version mismatched. "
1089                               "File content was changed during archiving. "
1090                                DFID", start:%#llx current:%#llx\n",
1091                                PFID(&copy->hc_hai.hai_fid),
1092                                copy->hc_data_version, data_version);
1093                         /* File was changed, send error to cdt. Do not ask for
1094                          * retry because if a file is modified frequently,
1095                          * the cdt will loop on retried archive requests.
1096                          * The policy engine will ask for a new archive later
1097                          * when the file will not be modified for some tunable
1098                          * time
1099                          */
1100                         hpk.hpk_flags &= ~HP_FLAG_RETRY;
1101                         rc = -EBUSY;
1102                         /* hpk_errval must be >= 0 */
1103                         hpk.hpk_errval = -rc;
1104                         GOTO(progress, rc);
1105                 }
1106
1107         }
1108
1109 progress:
1110         rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
1111                             &hpk, NULL);
1112
1113         /* Return first error */
1114         RETURN(rc != 0 ? rc : rc2);
1115 }
1116
1117
1118 static int copy_and_ct_start(int cmd, struct obd_export *exp,
1119                              const struct lustre_kernelcomm __user *data)
1120 {
1121         struct lustre_kernelcomm *lk;
1122         struct lustre_kernelcomm *tmp;
1123         size_t size = sizeof(*lk);
1124         size_t new_size;
1125         int i;
1126         int rc;
1127
1128         /* copy data from userspace to get numbers of archive_id */
1129         OBD_ALLOC(lk, size);
1130         if (lk == NULL)
1131                 return -ENOMEM;
1132
1133         if (copy_from_user(lk, data, size))
1134                 GOTO(out_lk, rc = -EFAULT);
1135
1136         if (lk->lk_flags & LK_FLG_STOP)
1137                 goto do_ioctl;
1138
1139         if (!(lk->lk_flags & LK_FLG_DATANR)) {
1140                 __u32 archive_mask = lk->lk_data_count;
1141                 int count;
1142
1143                 /* old hsm agent to old MDS */
1144                 if (!exp_connect_archive_id_array(exp))
1145                         goto do_ioctl;
1146
1147                 /* old hsm agent to new MDS */
1148                 lk->lk_flags |= LK_FLG_DATANR;
1149
1150                 if (archive_mask == 0)
1151                         goto do_ioctl;
1152
1153                 count = hweight32(archive_mask);
1154                 new_size = offsetof(struct lustre_kernelcomm, lk_data[count]);
1155                 OBD_ALLOC(tmp, new_size);
1156                 if (tmp == NULL)
1157                         GOTO(out_lk, rc = -ENOMEM);
1158
1159                 memcpy(tmp, lk, size);
1160                 tmp->lk_data_count = count;
1161                 OBD_FREE(lk, size);
1162                 lk = tmp;
1163                 size = new_size;
1164
1165                 count = 0;
1166                 for (i = 0; i < sizeof(archive_mask) * 8; i++) {
1167                         if (BIT(i) & archive_mask) {
1168                                 lk->lk_data[count] = i + 1;
1169                                 count++;
1170                         }
1171                 }
1172                 goto do_ioctl;
1173         }
1174
1175         /* new hsm agent to new mds */
1176         if (lk->lk_data_count > 0) {
1177                 new_size = offsetof(struct lustre_kernelcomm,
1178                                     lk_data[lk->lk_data_count]);
1179                 OBD_ALLOC(tmp, new_size);
1180                 if (tmp == NULL)
1181                         GOTO(out_lk, rc = -ENOMEM);
1182
1183                 OBD_FREE(lk, size);
1184                 lk = tmp;
1185                 size = new_size;
1186
1187                 if (copy_from_user(lk, data, size))
1188                         GOTO(out_lk, rc = -EFAULT);
1189         }
1190
1191         /* new hsm agent to old MDS */
1192         if (!exp_connect_archive_id_array(exp)) {
1193                 __u32 archives = 0;
1194
1195                 if (lk->lk_data_count > LL_HSM_ORIGIN_MAX_ARCHIVE)
1196                         GOTO(out_lk, rc = -EINVAL);
1197
1198                 for (i = 0; i < lk->lk_data_count; i++) {
1199                         if (lk->lk_data[i] > LL_HSM_ORIGIN_MAX_ARCHIVE) {
1200                                 rc = -EINVAL;
1201                                 CERROR("%s: archive id %d requested but only [0 - %zu] supported: rc = %d\n",
1202                                        exp->exp_obd->obd_name, lk->lk_data[i],
1203                                        LL_HSM_ORIGIN_MAX_ARCHIVE, rc);
1204                                 GOTO(out_lk, rc);
1205                         }
1206
1207                         if (lk->lk_data[i] == 0) {
1208                                 archives = 0;
1209                                 break;
1210                         }
1211
1212                         archives |= (1 << (lk->lk_data[i] - 1));
1213                 }
1214                 lk->lk_flags &= ~LK_FLG_DATANR;
1215                 lk->lk_data_count = archives;
1216         }
1217 do_ioctl:
1218         rc = obd_iocontrol(cmd, exp, size, lk, NULL);
1219 out_lk:
1220         OBD_FREE(lk, size);
1221         return rc;
1222 }
1223
1224 static int check_owner(int type, int id)
1225 {
1226         switch (type) {
1227         case USRQUOTA:
1228                 if (!uid_eq(current_euid(), make_kuid(&init_user_ns, id)))
1229                         return -EPERM;
1230                 break;
1231         case GRPQUOTA:
1232                 if (!in_egroup_p(make_kgid(&init_user_ns, id)))
1233                         return -EPERM;
1234                 break;
1235         case PRJQUOTA:
1236                 break;
1237         }
1238         return 0;
1239 }
1240
1241 struct kmem_cache *quota_iter_slab;
1242 static DEFINE_MUTEX(quotactl_iter_lock);
1243
1244 struct ll_quotactl_iter_list {
1245         __u64            lqil_mark;      /* iter identifier */
1246         __u32            lqil_flags;     /* what has been done */
1247         pid_t            lqil_pid;       /* debug calling task */
1248         time64_t         lqil_iter_time; /* the time to iter */
1249         struct list_head lqil_sbi_list;  /* list on ll_sb_info */
1250         struct list_head lqil_quotactl_iter_list; /* list of quota iters */
1251 };
1252
1253 void ll_quota_iter_check_and_cleanup(struct ll_sb_info *sbi, bool check)
1254 {
1255         struct if_quotactl_iter *iter_rec = NULL;
1256         struct ll_quotactl_iter_list *tmp, *ll_iter = NULL;
1257
1258         if (!check)
1259                 mutex_lock(&quotactl_iter_lock);
1260
1261         list_for_each_entry_safe(ll_iter, tmp, &sbi->ll_all_quota_list,
1262                                  lqil_sbi_list) {
1263                 if (check &&
1264                     ll_iter->lqil_iter_time > (ktime_get_seconds() - 86400))
1265                         continue;
1266
1267                 while ((iter_rec = list_first_entry_or_null(
1268                                         &ll_iter->lqil_quotactl_iter_list,
1269                                         struct if_quotactl_iter,
1270                                         qci_link)) != NULL) {
1271                         list_del_init(&iter_rec->qci_link);
1272                         OBD_SLAB_FREE_PTR(iter_rec, quota_iter_slab);
1273                 }
1274
1275                 list_del_init(&ll_iter->lqil_sbi_list);
1276                 OBD_FREE_PTR(ll_iter);
1277         }
1278
1279         if (!check)
1280                 mutex_unlock(&quotactl_iter_lock);
1281 }
1282
1283 /* iterate the quota usage from all QSDs */
1284 static int quotactl_iter_acct(struct list_head *quota_list, void *buffer,
1285                               __u64 size, __u64 *count, __u32 qtype, bool is_md)
1286 {
1287         struct if_quotactl_iter *tmp, *iter = NULL;
1288         struct lquota_acct_rec *acct;
1289         __u64 qid, cur = 0;
1290         int rc = 0;
1291
1292         ENTRY;
1293
1294         while (cur < size) {
1295                 if ((size - cur) <
1296                     (sizeof(qid) + sizeof(*acct))) {
1297                         rc = -EPROTO;
1298                         break;
1299                 }
1300
1301                 qid = *((__u64 *)(buffer + cur));
1302                 cur += sizeof(qid);
1303                 acct = (struct lquota_acct_rec *)(buffer + cur);
1304                 cur += sizeof(*acct);
1305
1306                 iter = NULL;
1307                 list_for_each_entry(tmp, quota_list, qci_link) {
1308                         if (tmp->qci_qc.qc_id == (__u32)qid) {
1309                                 iter = tmp;
1310                                 break;
1311                         }
1312                 }
1313
1314                 if (iter == NULL) {
1315                         CDEBUG(D_QUOTA, "can't find the iter record for %llu\n",
1316                                qid);
1317
1318                         if (qid != 0)
1319                                 continue;
1320
1321                         OBD_SLAB_ALLOC_PTR(iter, quota_iter_slab);
1322                         if (iter == NULL) {
1323                                 rc = -ENOMEM;
1324                                 break;
1325                         }
1326
1327                         INIT_LIST_HEAD(&iter->qci_link);
1328                         iter->qci_qc.qc_id = 0;
1329                         iter->qci_qc.qc_type = qtype;
1330                         (*count)++;
1331
1332                         list_add(&iter->qci_link, quota_list);
1333                 }
1334
1335                 if (is_md) {
1336                         iter->qci_qc.qc_dqblk.dqb_valid |= QIF_INODES;
1337                         iter->qci_qc.qc_dqblk.dqb_curinodes += acct->ispace;
1338                         iter->qci_qc.qc_dqblk.dqb_curspace += acct->bspace;
1339                 } else {
1340                         iter->qci_qc.qc_dqblk.dqb_valid |= QIF_SPACE;
1341                         iter->qci_qc.qc_dqblk.dqb_curspace += acct->bspace;
1342                 }
1343         }
1344
1345         RETURN(rc);
1346 }
1347
1348 /* iterate all quota settings from QMT */
1349 static int quotactl_iter_glb(struct list_head *quota_list, void *buffer,
1350                              __u64 size, __u64 *count, __u32 qtype, bool is_md)
1351 {
1352         struct if_quotactl_iter *tmp, *iter = NULL;
1353         struct lquota_glb_rec *glb;
1354         __u64 qid, cur = 0;
1355         bool inserted = false;
1356         int rc = 0;
1357
1358         ENTRY;
1359
1360         while (cur < size) {
1361                 if ((size - cur) <
1362                     (sizeof(qid) + sizeof(*glb))) {
1363                         rc = -EPROTO;
1364                         break;
1365                 }
1366
1367                 qid = *((__u64 *)(buffer + cur));
1368                 cur += sizeof(qid);
1369                 glb = (struct lquota_glb_rec *)(buffer + cur);
1370                 cur += sizeof(*glb);
1371
1372                 iter = NULL;
1373                 list_for_each_entry(tmp, quota_list, qci_link) {
1374                         if (tmp->qci_qc.qc_id == (__u32)qid) {
1375                                 iter = tmp;
1376                                 break;
1377                         }
1378                 }
1379
1380                 if (iter == NULL) {
1381                         OBD_SLAB_ALLOC_PTR(iter, quota_iter_slab);
1382                         if (iter == NULL) {
1383                                 rc = -ENOMEM;
1384                                 break;
1385                         }
1386
1387                         INIT_LIST_HEAD(&iter->qci_link);
1388
1389                         inserted = false;
1390                         list_for_each_entry(tmp, quota_list, qci_link) {
1391                                 if (tmp->qci_qc.qc_id < qid)
1392                                         continue;
1393
1394                                 inserted = true;
1395                                 list_add_tail(&iter->qci_link,
1396                                               &tmp->qci_link);
1397                                 break;
1398                         }
1399
1400                         if (!inserted)
1401                                 list_add_tail(&iter->qci_link, quota_list);
1402
1403                         iter->qci_qc.qc_type = qtype;
1404                         iter->qci_qc.qc_id = (__u32)qid;
1405                         (*count)++;
1406                 }
1407
1408                 if (is_md) {
1409                         iter->qci_qc.qc_dqblk.dqb_valid |= QIF_ILIMITS;
1410                         iter->qci_qc.qc_dqblk.dqb_ihardlimit =
1411                                                              glb->qbr_hardlimit;
1412                         iter->qci_qc.qc_dqblk.dqb_isoftlimit =
1413                                                              glb->qbr_softlimit;
1414                         iter->qci_qc.qc_dqblk.dqb_itime = glb->qbr_time;
1415                 } else {
1416                         iter->qci_qc.qc_dqblk.dqb_valid |= QIF_BLIMITS;
1417                         iter->qci_qc.qc_dqblk.dqb_bhardlimit =
1418                                                              glb->qbr_hardlimit;
1419                         iter->qci_qc.qc_dqblk.dqb_bsoftlimit =
1420                                                              glb->qbr_softlimit;
1421                         iter->qci_qc.qc_dqblk.dqb_btime = glb->qbr_time;
1422                 }
1423         }
1424
1425         RETURN(rc);
1426 }
1427
1428 /* iterate the quota setting from QMT and all QSDs to get the quota information
1429  * for all users or groups
1430  **/
1431 static int quotactl_iter(struct ll_sb_info *sbi, struct if_quotactl *qctl)
1432 {
1433         struct list_head iter_quota_glb_list;
1434         struct list_head iter_obd_quota_md_list;
1435         struct list_head iter_obd_quota_dt_list;
1436         struct ll_quotactl_iter_list *ll_iter;
1437         struct lquota_iter *iter;
1438         struct obd_quotactl *oqctl;
1439         __u64 count;
1440         int rc = 0;
1441
1442         ENTRY;
1443
1444         OBD_ALLOC_PTR(ll_iter);
1445         if (ll_iter == NULL)
1446                 RETURN(-ENOMEM);
1447
1448         INIT_LIST_HEAD(&ll_iter->lqil_sbi_list);
1449         INIT_LIST_HEAD(&ll_iter->lqil_quotactl_iter_list);
1450
1451         mutex_lock(&quotactl_iter_lock);
1452
1453         if (!list_empty(&sbi->ll_all_quota_list))
1454                 ll_quota_iter_check_and_cleanup(sbi, true);
1455
1456         INIT_LIST_HEAD(&iter_quota_glb_list);
1457         INIT_LIST_HEAD(&iter_obd_quota_md_list);
1458         INIT_LIST_HEAD(&iter_obd_quota_dt_list);
1459
1460         OBD_ALLOC_PTR(oqctl);
1461         if (oqctl == NULL)
1462                 GOTO(out, rc = -ENOMEM);
1463
1464         QCTL_COPY(oqctl, qctl);
1465         oqctl->qc_iter_list = (__u64)&iter_quota_glb_list;
1466         rc = obd_quotactl(sbi->ll_md_exp, oqctl);
1467         if (rc)
1468                 GOTO(cleanup, rc);
1469
1470         QCTL_COPY(oqctl, qctl);
1471         oqctl->qc_cmd = LUSTRE_Q_ITEROQUOTA;
1472         oqctl->qc_iter_list = (__u64)&iter_obd_quota_md_list;
1473         rc = obd_quotactl(sbi->ll_md_exp, oqctl);
1474         if (rc)
1475                 GOTO(cleanup, rc);
1476
1477         QCTL_COPY(oqctl, qctl);
1478         oqctl->qc_cmd = LUSTRE_Q_ITEROQUOTA;
1479         oqctl->qc_iter_list = (__u64)&iter_obd_quota_dt_list;
1480         rc = obd_quotactl(sbi->ll_dt_exp, oqctl);
1481         if (rc)
1482                 GOTO(cleanup, rc);
1483
1484         count = 0;
1485         while ((iter = list_first_entry_or_null(&iter_quota_glb_list,
1486                                                 struct lquota_iter, li_link))) {
1487                 void *buffer;
1488
1489                 buffer = iter->li_buffer;
1490                 rc = quotactl_iter_glb(&ll_iter->lqil_quotactl_iter_list,
1491                                        buffer, iter->li_md_size, &count,
1492                                        oqctl->qc_type, true);
1493                 if (rc)
1494                         GOTO(cleanup, rc);
1495
1496                 buffer = iter->li_buffer + LQUOTA_ITER_BUFLEN / 2;
1497                 rc = quotactl_iter_glb(&ll_iter->lqil_quotactl_iter_list,
1498                                        buffer, iter->li_dt_size, &count,
1499                                        oqctl->qc_type,  false);
1500
1501                 if (rc)
1502                         GOTO(cleanup, rc);
1503
1504                 list_del_init(&iter->li_link);
1505                 OBD_FREE_LARGE(iter,
1506                                sizeof(struct lquota_iter) + LQUOTA_ITER_BUFLEN);
1507         }
1508
1509         while ((iter = list_first_entry_or_null(&iter_obd_quota_md_list,
1510                                                 struct lquota_iter, li_link))) {
1511                 rc = quotactl_iter_acct(&ll_iter->lqil_quotactl_iter_list,
1512                                         iter->li_buffer, iter->li_md_size,
1513                                         &count, oqctl->qc_type, true);
1514                 if (rc)
1515                         GOTO(cleanup, rc);
1516
1517                 list_del_init(&iter->li_link);
1518                 OBD_FREE_LARGE(iter,
1519                                sizeof(struct lquota_iter) + LQUOTA_ITER_BUFLEN);
1520         }
1521
1522         while ((iter = list_first_entry_or_null(&iter_obd_quota_dt_list,
1523                                                 struct lquota_iter, li_link))) {
1524                 rc = quotactl_iter_acct(&ll_iter->lqil_quotactl_iter_list,
1525                                         iter->li_buffer, iter->li_dt_size,
1526                                         &count, oqctl->qc_type, false);
1527                 if (rc)
1528                         GOTO(cleanup, rc);
1529
1530                 list_del_init(&iter->li_link);
1531                 OBD_FREE_LARGE(iter,
1532                                sizeof(struct lquota_iter) + LQUOTA_ITER_BUFLEN);
1533         }
1534
1535         ll_iter->lqil_mark = ((__u64)current->pid << 32) |
1536                              ((__u32)qctl->qc_type << 8) |
1537                              (ktime_get_seconds() & 0xFFFFFF);
1538         ll_iter->lqil_flags = qctl->qc_type;
1539         ll_iter->lqil_pid = current->pid;
1540         ll_iter->lqil_iter_time = ktime_get_seconds();
1541
1542         list_add(&ll_iter->lqil_sbi_list, &sbi->ll_all_quota_list);
1543
1544         qctl->qc_allquota_count = count;
1545         qctl->qc_allquota_mark = ll_iter->lqil_mark;
1546         GOTO(out, rc);
1547
1548 cleanup:
1549         ll_quota_iter_check_and_cleanup(sbi, true);
1550
1551         while ((iter = list_first_entry_or_null(&iter_quota_glb_list,
1552                                                 struct lquota_iter, li_link))) {
1553                 list_del_init(&iter->li_link);
1554                 OBD_FREE_LARGE(iter,
1555                                sizeof(struct lquota_iter) + LQUOTA_ITER_BUFLEN);
1556         }
1557
1558         while ((iter = list_first_entry_or_null(&iter_obd_quota_md_list,
1559                                                 struct lquota_iter, li_link))) {
1560                 list_del_init(&iter->li_link);
1561                 OBD_FREE_LARGE(iter,
1562                                sizeof(struct lquota_iter) + LQUOTA_ITER_BUFLEN);
1563         }
1564
1565         while ((iter = list_first_entry_or_null(&iter_obd_quota_dt_list,
1566                                                 struct lquota_iter, li_link))) {
1567                 list_del_init(&iter->li_link);
1568                 OBD_FREE_LARGE(iter,
1569                                sizeof(struct lquota_iter) + LQUOTA_ITER_BUFLEN);
1570         }
1571
1572         OBD_FREE_PTR(ll_iter);
1573
1574 out:
1575         OBD_FREE_PTR(oqctl);
1576
1577         mutex_unlock(&quotactl_iter_lock);
1578         RETURN(rc);
1579 }
1580
1581 static int quotactl_getallquota(struct ll_sb_info *sbi,
1582                                 struct if_quotactl *qctl)
1583 {
1584         struct ll_quotactl_iter_list *ll_iter = NULL;
1585         struct if_quotactl_iter *iter = NULL;
1586         void __user *buffer = (void __user *)qctl->qc_allquota_buffer;
1587         __u64 cur = 0, count = qctl->qc_allquota_buflen;
1588         int rc = 0;
1589
1590         ENTRY;
1591
1592         mutex_lock(&quotactl_iter_lock);
1593
1594         while ((ll_iter = list_first_entry_or_null(&sbi->ll_all_quota_list,
1595                                                 struct ll_quotactl_iter_list,
1596                                                 lqil_sbi_list)) != NULL) {
1597                 if (qctl->qc_allquota_mark == ll_iter->lqil_mark)
1598                         break;
1599         }
1600
1601         if (!ll_iter) {
1602                 mutex_unlock(&quotactl_iter_lock);
1603                 RETURN(-EBUSY);
1604         }
1605
1606         while ((iter = list_first_entry_or_null(
1607                                         &ll_iter->lqil_quotactl_iter_list,
1608                                         struct if_quotactl_iter, qci_link))) {
1609                 if (count - cur < sizeof(struct if_quotactl)) {
1610                         rc = -ERANGE;
1611                         break;
1612                 }
1613
1614                 if (copy_to_user(buffer + cur, &iter->qci_qc,
1615                                  sizeof(struct if_quotactl))) {
1616                         rc = -EFAULT;
1617                         break;
1618                 }
1619
1620                 cur += sizeof(struct if_quotactl);
1621
1622                 list_del_init(&iter->qci_link);
1623                 OBD_SLAB_FREE_PTR(iter, quota_iter_slab);
1624         }
1625
1626         /* cleanup in case of error */
1627         while ((iter = list_first_entry_or_null(
1628                                         &ll_iter->lqil_quotactl_iter_list,
1629                                         struct if_quotactl_iter, qci_link))) {
1630                 list_del_init(&iter->qci_link);
1631                 OBD_SLAB_FREE_PTR(iter, quota_iter_slab);
1632         }
1633
1634         mutex_unlock(&quotactl_iter_lock);
1635
1636         RETURN(rc);
1637 }
1638
1639 int quotactl_ioctl(struct super_block *sb, struct if_quotactl *qctl)
1640 {
1641         struct ll_sb_info *sbi = ll_s2sbi(sb);
1642         int cmd = qctl->qc_cmd;
1643         int type = qctl->qc_type;
1644         int id = qctl->qc_id;
1645         int valid = qctl->qc_valid;
1646         int rc = 0;
1647
1648         ENTRY;
1649
1650         switch (cmd) {
1651         case Q_SETQUOTA:
1652         case Q_SETINFO:
1653         case LUSTRE_Q_SETDEFAULT:
1654         case LUSTRE_Q_SETQUOTAPOOL:
1655         case LUSTRE_Q_SETINFOPOOL:
1656         case LUSTRE_Q_SETDEFAULT_POOL:
1657         case LUSTRE_Q_DELETEQID:
1658         case LUSTRE_Q_RESETQID:
1659                 if (!capable(CAP_SYS_ADMIN))
1660                         RETURN(-EPERM);
1661
1662                 if (sb->s_flags & SB_RDONLY)
1663                         RETURN(-EROFS);
1664                 break;
1665         case Q_GETQUOTA:
1666         case LUSTRE_Q_GETDEFAULT:
1667         case LUSTRE_Q_GETQUOTAPOOL:
1668         case LUSTRE_Q_GETDEFAULT_POOL:
1669         case LUSTRE_Q_ITERQUOTA:
1670         case LUSTRE_Q_GETALLQUOTA:
1671                 if (check_owner(type, id) &&
1672                     (!capable(CAP_SYS_ADMIN)))
1673                         RETURN(-EPERM);
1674                 break;
1675         case Q_GETINFO:
1676         case LUSTRE_Q_GETINFOPOOL:
1677                 break;
1678         default:
1679                 CERROR("%s: unsupported quotactl op: %#x: rc = %d\n",
1680                        sbi->ll_fsname, cmd, -EOPNOTSUPP);
1681                 RETURN(-EOPNOTSUPP);
1682         }
1683
1684         if (cmd == LUSTRE_Q_ITERQUOTA) {
1685                 rc = quotactl_iter(sbi, qctl);
1686         } else if (cmd == LUSTRE_Q_GETALLQUOTA) {
1687                 rc = quotactl_getallquota(sbi, qctl);
1688         } else if (valid != QC_GENERAL) {
1689                 if (cmd == Q_GETINFO)
1690                         qctl->qc_cmd = Q_GETOINFO;
1691                 else if (cmd == Q_GETQUOTA ||
1692                          cmd == LUSTRE_Q_GETQUOTAPOOL)
1693                         qctl->qc_cmd = Q_GETOQUOTA;
1694                 else
1695                         RETURN(-EINVAL);
1696
1697                 switch (valid) {
1698                 case QC_MDTIDX:
1699                         rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
1700                                            sizeof(*qctl), qctl, NULL);
1701                         break;
1702                 case QC_OSTIDX:
1703                         rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
1704                                            sizeof(*qctl), qctl, NULL);
1705                         break;
1706                 case QC_UUID:
1707                         rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
1708                                            sizeof(*qctl), qctl, NULL);
1709                         if (rc == -EAGAIN)
1710                                 rc = obd_iocontrol(OBD_IOC_QUOTACTL,
1711                                                    sbi->ll_dt_exp,
1712                                                    sizeof(*qctl), qctl, NULL);
1713                         break;
1714                 default:
1715                         rc = -EINVAL;
1716                         break;
1717                 }
1718
1719                 qctl->qc_cmd = cmd;
1720                 if (rc)
1721                         RETURN(rc);
1722         } else {
1723                 struct obd_quotactl *oqctl;
1724                 int oqctl_len = sizeof(*oqctl);
1725
1726                 if (LUSTRE_Q_CMD_IS_POOL(cmd))
1727                         oqctl_len += LOV_MAXPOOLNAME + 1;
1728
1729                 OBD_ALLOC(oqctl, oqctl_len);
1730                 if (oqctl == NULL)
1731                         RETURN(-ENOMEM);
1732
1733                 QCTL_COPY(oqctl, qctl);
1734                 rc = obd_quotactl(sbi->ll_md_exp, oqctl);
1735                 if (rc) {
1736                         OBD_FREE(oqctl, oqctl_len);
1737                         RETURN(rc);
1738                 }
1739                 /* If QIF_SPACE is not set, client should collect the
1740                  * space usage from OSSs by itself
1741                  */
1742                 if ((cmd == Q_GETQUOTA || cmd == LUSTRE_Q_GETQUOTAPOOL) &&
1743                     !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
1744                     !oqctl->qc_dqblk.dqb_curspace) {
1745                         struct obd_quotactl *oqctl_tmp;
1746                         int qctl_len = sizeof(*oqctl_tmp) + LOV_MAXPOOLNAME + 1;
1747
1748                         OBD_ALLOC(oqctl_tmp, qctl_len);
1749                         if (oqctl_tmp == NULL)
1750                                 GOTO(out, rc = -ENOMEM);
1751
1752                         if (cmd == LUSTRE_Q_GETQUOTAPOOL) {
1753                                 oqctl_tmp->qc_cmd = LUSTRE_Q_GETQUOTAPOOL;
1754                                 memcpy(oqctl_tmp->qc_poolname,
1755                                        qctl->qc_poolname,
1756                                        LOV_MAXPOOLNAME + 1);
1757                         } else {
1758                                 oqctl_tmp->qc_cmd = Q_GETOQUOTA;
1759                         }
1760                         oqctl_tmp->qc_id = oqctl->qc_id;
1761                         oqctl_tmp->qc_type = oqctl->qc_type;
1762
1763                         /* collect space usage from OSTs */
1764                         oqctl_tmp->qc_dqblk.dqb_curspace = 0;
1765                         rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
1766                         if (!rc || rc == -EREMOTEIO) {
1767                                 oqctl->qc_dqblk.dqb_curspace =
1768                                         oqctl_tmp->qc_dqblk.dqb_curspace;
1769                                 oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
1770                         }
1771
1772                         /* collect space & inode usage from MDTs */
1773                         oqctl_tmp->qc_cmd = Q_GETOQUOTA;
1774                         oqctl_tmp->qc_dqblk.dqb_curspace = 0;
1775                         oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
1776                         rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
1777                         if (!rc || rc == -EREMOTEIO) {
1778                                 oqctl->qc_dqblk.dqb_curspace +=
1779                                         oqctl_tmp->qc_dqblk.dqb_curspace;
1780                                 oqctl->qc_dqblk.dqb_curinodes =
1781                                         oqctl_tmp->qc_dqblk.dqb_curinodes;
1782                                 oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
1783                         } else {
1784                                 oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
1785                         }
1786
1787                         OBD_FREE(oqctl_tmp, qctl_len);
1788                 }
1789 out:
1790                 QCTL_COPY(qctl, oqctl);
1791                 OBD_FREE(oqctl, oqctl_len);
1792         }
1793
1794         RETURN(rc);
1795 }
1796
1797 static int ll_rmfid(struct file *file, void __user *arg)
1798 {
1799         const struct fid_array __user *ufa = arg;
1800         struct inode *inode = file_inode(file);
1801         struct ll_sb_info *sbi = ll_i2sbi(inode);
1802         struct fid_array *lfa = NULL, *lfa_new = NULL;
1803         int i, rc, *rcs = NULL;
1804         unsigned int nr;
1805         bool lfa_flag = false; /* lfa already free'ed */
1806         size_t size;
1807
1808         ENTRY;
1809         if (!capable(CAP_DAC_READ_SEARCH) &&
1810             !test_bit(LL_SBI_USER_FID2PATH, ll_i2sbi(inode)->ll_flags))
1811                 RETURN(-EPERM);
1812         /* Only need to get the buflen */
1813         if (get_user(nr, &ufa->fa_nr))
1814                 RETURN(-EFAULT);
1815         /* DoS protection */
1816         if (nr > OBD_MAX_FIDS_IN_ARRAY)
1817                 RETURN(-E2BIG);
1818
1819         size = offsetof(struct fid_array, fa_fids[nr]);
1820         OBD_ALLOC(lfa, size);
1821         if (!lfa)
1822                 RETURN(-ENOMEM);
1823         OBD_ALLOC_PTR_ARRAY(rcs, nr);
1824         if (!rcs)
1825                 GOTO(free_lfa, rc = -ENOMEM);
1826
1827         if (copy_from_user(lfa, arg, size))
1828                 GOTO(free_rcs, rc = -EFAULT);
1829
1830         /* In case of subdirectory mount, we need to make sure all the files
1831          * for which we want to remove FID are visible in the namespace.
1832          */
1833         if (!fid_is_root(&sbi->ll_root_fid)) {
1834                 int path_len = PATH_MAX, linkno;
1835                 struct getinfo_fid2path *gf;
1836                 int idx, last_idx = nr - 1;
1837
1838                 lfa_new = NULL;
1839
1840                 OBD_ALLOC(lfa_new, size);
1841                 if (!lfa_new)
1842                         GOTO(free_rcs, rc = -ENOMEM);
1843                 lfa_new->fa_nr = 0;
1844
1845                 gf = kmalloc(sizeof(*gf) + path_len + 1, GFP_NOFS);
1846                 if (!gf)
1847                         GOTO(free_lfa_new, rc = -ENOMEM);
1848
1849                 for (idx = 0; idx < nr; idx++) {
1850                         linkno = 0;
1851                         while (1) {
1852                                 memset(gf, 0, sizeof(*gf) + path_len + 1);
1853                                 gf->gf_fid = lfa->fa_fids[idx];
1854                                 gf->gf_pathlen = path_len;
1855                                 gf->gf_linkno = linkno;
1856                                 rc = __ll_fid2path(inode, gf,
1857                                                    sizeof(*gf) + gf->gf_pathlen,
1858                                                    gf->gf_pathlen);
1859                                 if (rc == -ENAMETOOLONG) {
1860                                         struct getinfo_fid2path *tmpgf;
1861
1862                                         path_len += PATH_MAX;
1863                                         tmpgf = krealloc(gf,
1864                                                      sizeof(*gf) + path_len + 1,
1865                                                      GFP_NOFS);
1866                                         if (!tmpgf) {
1867                                                 kfree(gf);
1868                                                 GOTO(free_lfa_new, rc = -ENOMEM);
1869                                         }
1870                                         gf = tmpgf;
1871                                         continue;
1872                                 }
1873                                 if (rc)
1874                                         break;
1875                                 if (gf->gf_linkno == linkno)
1876                                         break;
1877                                 linkno = gf->gf_linkno;
1878                         }
1879
1880                         if (!rc) {
1881                                 /* All the links for this fid are visible in the
1882                                  * mounted subdir. So add it to the list of fids
1883                                  * to remove.
1884                                  */
1885                                 lfa_new->fa_fids[lfa_new->fa_nr++] =
1886                                         lfa->fa_fids[idx];
1887                         } else {
1888                                 /* At least one link for this fid is not visible
1889                                  * in the mounted subdir. So add it at the end
1890                                  * of the list that will be hidden to lower
1891                                  * layers, and set -ENOENT as ret code.
1892                                  */
1893                                 lfa_new->fa_fids[last_idx] = lfa->fa_fids[idx];
1894                                 rcs[last_idx--] = rc;
1895                         }
1896                 }
1897                 kfree(gf);
1898                 OBD_FREE(lfa, size);
1899                 lfa_flag = true;
1900                 lfa = lfa_new;
1901         }
1902         if (lfa->fa_nr == 0)
1903                 GOTO(free_rcs, rc = rcs[nr - 1]);
1904
1905         /* Call mdc_iocontrol */
1906         rc = md_rmfid(ll_i2mdexp(file_inode(file)), lfa, rcs, NULL);
1907         lfa->fa_nr = nr;
1908         if (!rc) {
1909                 for (i = 0; i < nr; i++)
1910                         if (rcs[i])
1911                                 lfa->fa_fids[i].f_ver = rcs[i];
1912                 if (copy_to_user(arg, lfa, size))
1913                         rc = -EFAULT;
1914         }
1915
1916 free_lfa_new:
1917         OBD_FREE(lfa_new, size);
1918 free_rcs:
1919         OBD_FREE_PTR_ARRAY(rcs, nr);
1920 free_lfa:
1921         if (!lfa_flag)
1922                 OBD_FREE(lfa, size);
1923
1924         RETURN(rc);
1925 }
1926
1927 /* This function tries to get a single name component, to send to the server.
1928  * No actual path traversal involved, so we limit to NAME_MAX
1929  */
1930 static char *ll_getname(const char __user *filename)
1931 {
1932         int ret = 0, len;
1933         char *tmp;
1934
1935         OBD_ALLOC(tmp, NAME_MAX + 1);
1936
1937         if (!tmp)
1938                 return ERR_PTR(-ENOMEM);
1939
1940         len = strncpy_from_user(tmp, filename, NAME_MAX + 1);
1941         if (len < 0)
1942                 ret = -ENOENT;
1943         else if (len > NAME_MAX)
1944                 ret = -ENAMETOOLONG;
1945
1946         if (ret) {
1947                 OBD_FREE(tmp, NAME_MAX + 1);
1948                 tmp =  ERR_PTR(ret);
1949         }
1950         return tmp;
1951 }
1952
1953 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
1954
1955 #define ll_putname(filename) OBD_FREE(filename, NAME_MAX + 1);
1956
1957 static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1958 {
1959         struct dentry *dentry = file_dentry(file);
1960         struct inode *inode = file_inode(file);
1961         struct ll_sb_info *sbi = ll_i2sbi(inode);
1962         struct obd_ioctl_data *data = NULL;
1963         void __user *uarg = (void __user *)arg;
1964         int rc = 0;
1965
1966         ENTRY;
1967         CDEBUG(D_VFSTRACE|D_IOCTL, "VFS Op:inode="DFID"(%pK) cmd=%x arg=%lx\n",
1968                PFID(ll_inode2fid(inode)), inode, cmd, arg);
1969
1970         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1971         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1972                 return -ENOTTY;
1973
1974         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1975         switch (cmd) {
1976         case IOC_MDC_LOOKUP: {
1977                 int namelen, len = 0;
1978                 char *filename;
1979
1980                 rc = obd_ioctl_getdata(&data, &len, uarg);
1981                 if (rc != 0)
1982                         RETURN(rc);
1983
1984                 filename = data->ioc_inlbuf1;
1985                 namelen = strlen(filename);
1986                 if (namelen < 1) {
1987                         CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1988                         GOTO(out_free, rc = -EINVAL);
1989                 }
1990
1991                 rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL);
1992                 if (rc < 0) {
1993                         CERROR("%s: lookup %.*s failed: rc = %d\n",
1994                                sbi->ll_fsname, namelen, filename, rc);
1995                         GOTO(out_free, rc);
1996                 }
1997 out_free:
1998                 OBD_FREE_LARGE(data, len);
1999                 return rc;
2000         }
2001         case LL_IOC_LMV_SETSTRIPE: {
2002                 struct lmv_user_md  *lum;
2003                 char *filename;
2004                 int namelen = 0;
2005                 int lumlen = 0;
2006                 umode_t mode;
2007                 bool createonly = false;
2008                 int len;
2009                 int rc;
2010
2011                 rc = obd_ioctl_getdata(&data, &len, uarg);
2012                 if (rc)
2013                         RETURN(rc);
2014
2015                 if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
2016                     data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
2017                         GOTO(lmv_out_free, rc = -EINVAL);
2018
2019                 filename = data->ioc_inlbuf1;
2020                 namelen = data->ioc_inllen1;
2021
2022                 if (namelen < 1) {
2023                         CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
2024                         GOTO(lmv_out_free, rc = -EINVAL);
2025                 }
2026                 lum = (struct lmv_user_md *)data->ioc_inlbuf2;
2027                 lumlen = data->ioc_inllen2;
2028
2029                 if (!lmv_user_magic_supported(lum->lum_magic)) {
2030                         CERROR("%s: wrong lum magic %x : rc = %d\n", filename,
2031                                lum->lum_magic, -EINVAL);
2032                         GOTO(lmv_out_free, rc = -EINVAL);
2033                 }
2034
2035                 if ((lum->lum_magic == LMV_USER_MAGIC ||
2036                      lum->lum_magic == LMV_USER_MAGIC_SPECIFIC) &&
2037                     lumlen < sizeof(*lum)) {
2038                         CERROR("%s: wrong lum size %d for magic %x : rc = %d\n",
2039                                filename, lumlen, lum->lum_magic, -EINVAL);
2040                         GOTO(lmv_out_free, rc = -EINVAL);
2041                 }
2042
2043                 if (lum->lum_magic == LMV_MAGIC_FOREIGN &&
2044                     lumlen < sizeof(struct lmv_foreign_md)) {
2045                         CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
2046                                filename, lum->lum_magic, lumlen, -EFAULT);
2047                         GOTO(lmv_out_free, rc = -EINVAL);
2048                 }
2049
2050                 mode = data->ioc_type;
2051                 createonly = data->ioc_obdo1.o_flags & OBD_FL_OBDMDEXISTS;
2052                 rc = ll_dir_setdirstripe(dentry, lum, lumlen, filename, mode,
2053                                          createonly);
2054 lmv_out_free:
2055                 OBD_FREE_LARGE(data, len);
2056                 RETURN(rc);
2057
2058         }
2059         case LL_IOC_LMV_SET_DEFAULT_STRIPE: {
2060                 struct lmv_user_md lum;
2061                 struct lmv_user_md __user *ulump = uarg;
2062                 int rc;
2063
2064                 if (copy_from_user(&lum, ulump, sizeof(lum)))
2065                         RETURN(-EFAULT);
2066
2067                 if (lum.lum_magic != LMV_USER_MAGIC)
2068                         RETURN(-EINVAL);
2069
2070                 rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0);
2071
2072                 RETURN(rc);
2073         }
2074         case LL_IOC_LOV_SETSTRIPE_NEW:
2075         case LL_IOC_LOV_SETSTRIPE: {
2076                 struct lov_user_md_v3 *lumv3 = NULL;
2077                 struct lov_user_md_v1 lumv1;
2078                 struct lov_user_md_v1 *lumv1_ptr = &lumv1;
2079                 struct lov_user_md_v1 __user *lumv1p = uarg;
2080                 struct lov_user_md_v3 __user *lumv3p = uarg;
2081                 int lum_size = 0;
2082                 int set_default = 0;
2083
2084                 BUILD_BUG_ON(sizeof(struct lov_user_md_v3) <=
2085                              sizeof(struct lov_comp_md_v1));
2086                 BUILD_BUG_ON(sizeof(*lumv3) != sizeof(*lumv3p));
2087                 /* first try with v1 which is smaller than v3 */
2088                 if (copy_from_user(&lumv1, lumv1p, sizeof(lumv1)))
2089                         RETURN(-EFAULT);
2090
2091                 if (is_root_inode(inode))
2092                         set_default = 1;
2093
2094                 switch (lumv1.lmm_magic) {
2095                 case LOV_USER_MAGIC_V3:
2096                 case LOV_USER_MAGIC_SPECIFIC:
2097                         lum_size = ll_lov_user_md_size(&lumv1);
2098                         if (lum_size < 0)
2099                                 RETURN(lum_size);
2100                         OBD_ALLOC(lumv3, lum_size);
2101                         if (!lumv3)
2102                                 RETURN(-ENOMEM);
2103                         if (copy_from_user(lumv3, lumv3p, lum_size))
2104                                 GOTO(out, rc = -EFAULT);
2105                         lumv1_ptr = (struct lov_user_md_v1 *)lumv3;
2106                         break;
2107                 case LOV_USER_MAGIC_V1:
2108                         break;
2109                 default:
2110                         GOTO(out, rc = -EOPNOTSUPP);
2111                 }
2112
2113                 /* in v1 and v3 cases lumv1 points to data */
2114                 rc = ll_dir_setstripe(inode, lumv1_ptr, set_default);
2115 out:
2116                 if (lumv3)
2117                         OBD_FREE(lumv3, lum_size);
2118                 RETURN(rc);
2119         }
2120         case LL_IOC_LMV_GETSTRIPE: {
2121                 struct lmv_user_md __user *ulmv = uarg;
2122                 struct lmv_user_md lum;
2123                 struct ptlrpc_request *request = NULL;
2124                 union lmv_mds_md *lmm = NULL;
2125                 int lmmsize;
2126                 u64 valid = 0;
2127                 struct lmv_user_md *tmp = NULL;
2128                 int mdt_index;
2129                 int lum_size;
2130                 int stripe_count;
2131                 int max_stripe_count;
2132                 int i;
2133                 int rc;
2134
2135                 if (copy_from_user(&lum, ulmv, sizeof(*ulmv)))
2136                         RETURN(-EFAULT);
2137
2138                 /* get default LMV */
2139                 if (lum.lum_magic == LMV_USER_MAGIC &&
2140                     lum.lum_type != LMV_TYPE_RAW) {
2141                         rc = ll_dir_get_default_lmv(inode, &lum);
2142                         if (rc)
2143                                 RETURN(rc);
2144
2145                         if (copy_to_user(ulmv, &lum, sizeof(lum)))
2146                                 RETURN(-EFAULT);
2147
2148                         RETURN(0);
2149                 }
2150
2151                 max_stripe_count = lum.lum_stripe_count;
2152                 /* lum_magic will indicate which stripe the ioctl will like
2153                  * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC
2154                  * is for default LMV stripe
2155                  */
2156                 if (lum.lum_magic == LMV_MAGIC_V1)
2157                         valid |= OBD_MD_MEA;
2158                 else if (lum.lum_magic == LMV_USER_MAGIC)
2159                         valid |= OBD_MD_DEFAULT_MEA;
2160                 else
2161                         RETURN(-EINVAL);
2162
2163                 rc = ll_dir_getstripe_default(inode, (void **)&lmm, &lmmsize,
2164                                               &request, NULL, valid);
2165                 if (rc != 0)
2166                         GOTO(finish_req, rc);
2167
2168                 /* get default LMV in raw mode */
2169                 if (lum.lum_magic == LMV_USER_MAGIC) {
2170                         if (copy_to_user(ulmv, lmm, lmmsize))
2171                                 GOTO(finish_req, rc = -EFAULT);
2172                         GOTO(finish_req, rc);
2173                 }
2174
2175                 /* if foreign LMV case, fake stripes number */
2176                 if (lmm->lmv_magic == LMV_MAGIC_FOREIGN) {
2177                         struct lmv_foreign_md *lfm;
2178
2179                         lfm = (struct lmv_foreign_md *)lmm;
2180                         if (lfm->lfm_length < XATTR_SIZE_MAX -
2181                             offsetof(typeof(*lfm), lfm_value)) {
2182                                 __u32 size = lfm->lfm_length +
2183                                              offsetof(typeof(*lfm), lfm_value);
2184
2185                                 stripe_count = lmv_foreign_to_md_stripes(size);
2186                         } else {
2187                                 CERROR("%s: invalid %d foreign size returned: rc = %d\n",
2188                                        sbi->ll_fsname, lfm->lfm_length,
2189                                        -EINVAL);
2190                                 return -EINVAL;
2191                         }
2192                 } else {
2193                         stripe_count = lmv_mds_md_stripe_count_get(lmm);
2194                 }
2195                 if (max_stripe_count < stripe_count) {
2196                         lum.lum_stripe_count = stripe_count;
2197                         if (copy_to_user(ulmv, &lum, sizeof(lum)))
2198                                 GOTO(finish_req, rc = -EFAULT);
2199                         GOTO(finish_req, rc = -E2BIG);
2200                 }
2201
2202                 /* enough room on user side and foreign case */
2203                 if (lmm->lmv_magic == LMV_MAGIC_FOREIGN) {
2204                         struct lmv_foreign_md *lfm;
2205                         __u32 size;
2206
2207                         lfm = (struct lmv_foreign_md *)lmm;
2208                         size = lfm->lfm_length +
2209                                offsetof(struct lmv_foreign_md, lfm_value);
2210                         if (copy_to_user(ulmv, lfm, size))
2211                                 GOTO(finish_req, rc = -EFAULT);
2212                         GOTO(finish_req, rc);
2213                 }
2214
2215                 lum_size = lmv_user_md_size(stripe_count,
2216                                             LMV_USER_MAGIC_SPECIFIC);
2217                 OBD_ALLOC(tmp, lum_size);
2218                 if (tmp == NULL)
2219                         GOTO(finish_req, rc = -ENOMEM);
2220
2221                 mdt_index = ll_get_mdt_idx(inode);
2222                 if (mdt_index < 0)
2223                         GOTO(out_tmp, rc = -ENOMEM);
2224
2225                 tmp->lum_magic = LMV_MAGIC_V1;
2226                 tmp->lum_stripe_count = 0;
2227                 tmp->lum_stripe_offset = mdt_index;
2228                 tmp->lum_hash_type = lmv_mds_md_hash_type_get(lmm);
2229                 for (i = 0; i < stripe_count; i++) {
2230                         struct lu_fid   fid;
2231
2232                         fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]);
2233                         if (fid_is_sane(&fid)) {
2234                                 mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid);
2235                                 if (mdt_index < 0)
2236                                         GOTO(out_tmp, rc = mdt_index);
2237
2238                                 tmp->lum_objects[i].lum_mds = mdt_index;
2239                                 tmp->lum_objects[i].lum_fid = fid;
2240                         }
2241
2242                         tmp->lum_stripe_count++;
2243                 }
2244
2245                 if (copy_to_user(ulmv, tmp, lum_size))
2246                         GOTO(out_tmp, rc = -EFAULT);
2247 out_tmp:
2248                 OBD_FREE(tmp, lum_size);
2249 finish_req:
2250                 ptlrpc_req_finished(request);
2251                 return rc;
2252         }
2253         case LL_IOC_REMOVE_ENTRY: {
2254                 char *filename = NULL;
2255                 int namelen = 0;
2256                 int rc;
2257
2258                 /* Here is a little hack to avoid sending REINT_RMENTRY to
2259                  * unsupported server, which might crash the server(LU-2730),
2260                  * Because both LVB_TYPE and REINT_RMENTRY will be supported
2261                  * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the
2262                  * server will support REINT_RMENTRY XXX
2263                  */
2264                 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
2265                         RETURN(-EOPNOTSUPP);
2266
2267                 filename = ll_getname(uarg);
2268                 if (IS_ERR(filename))
2269                         RETURN(PTR_ERR(filename));
2270
2271                 namelen = strlen(filename);
2272                 if (namelen < 1)
2273                         GOTO(out_rmdir, rc = -EINVAL);
2274
2275                 rc = ll_rmdir_entry(inode, filename, namelen);
2276 out_rmdir:
2277                 if (filename)
2278                         ll_putname(filename);
2279                 RETURN(rc);
2280         }
2281         case LL_IOC_RMFID:
2282                 RETURN(ll_rmfid(file, uarg));
2283         case LL_IOC_LOV_SWAP_LAYOUTS:
2284                 RETURN(-EPERM);
2285         case LL_IOC_LOV_GETSTRIPE:
2286         case LL_IOC_LOV_GETSTRIPE_NEW:
2287         case LL_IOC_MDC_GETINFO_V1:
2288         case LL_IOC_MDC_GETINFO_V2:
2289         case IOC_MDC_GETFILEINFO_V1:
2290         case IOC_MDC_GETFILEINFO_V2:
2291         case IOC_MDC_GETFILESTRIPE: {
2292                 struct ptlrpc_request *request = NULL;
2293                 struct ptlrpc_request *root_request = NULL;
2294                 struct lov_user_md __user *lump;
2295                 struct lov_mds_md *lmm = NULL;
2296                 struct mdt_body *body;
2297                 char *filename = NULL;
2298                 lstat_t __user *statp = NULL;
2299                 lstatx_t __user *stxp = NULL;
2300                 __u64 __user *flagsp = NULL;
2301                 __u32 __user *lmmsizep = NULL;
2302                 struct lu_fid __user *fidp = NULL;
2303                 int lmmsize;
2304                 bool api32;
2305
2306                 if (cmd == IOC_MDC_GETFILEINFO_V1 ||
2307                     cmd == IOC_MDC_GETFILEINFO_V2 ||
2308                     cmd == IOC_MDC_GETFILESTRIPE) {
2309                         filename = ll_getname(uarg);
2310                         if (IS_ERR(filename))
2311                                 RETURN(PTR_ERR(filename));
2312
2313                         rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
2314                                                       &lmmsize, &request);
2315                 } else {
2316                         rc = ll_dir_getstripe_default(inode, (void **)&lmm,
2317                                                       &lmmsize, &request,
2318                                                       &root_request, 0);
2319                 }
2320
2321                 if (request) {
2322                         body = req_capsule_server_get(&request->rq_pill,
2323                                                       &RMF_MDT_BODY);
2324                         LASSERT(body != NULL);
2325                 } else {
2326                         GOTO(out_req, rc);
2327                 }
2328
2329                 if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO_V1 ||
2330                                        cmd == LL_IOC_MDC_GETINFO_V1 ||
2331                                        cmd == IOC_MDC_GETFILEINFO_V2 ||
2332                                        cmd == LL_IOC_MDC_GETINFO_V2)) {
2333                         lmmsize = 0;
2334                         rc = 0;
2335                 }
2336
2337                 if (rc < 0)
2338                         GOTO(out_req, rc);
2339
2340                 if (cmd == IOC_MDC_GETFILESTRIPE ||
2341                     cmd == LL_IOC_LOV_GETSTRIPE ||
2342                     cmd == LL_IOC_LOV_GETSTRIPE_NEW) {
2343                         lump = uarg;
2344                 } else if (cmd == IOC_MDC_GETFILEINFO_V1 ||
2345                            cmd == LL_IOC_MDC_GETINFO_V1){
2346                         struct lov_user_mds_data_v1 __user *lmdp;
2347
2348                         lmdp = uarg;
2349                         statp = &lmdp->lmd_st;
2350                         lump = &lmdp->lmd_lmm;
2351                 } else {
2352                         struct lov_user_mds_data __user *lmdp;
2353
2354                         lmdp = uarg;
2355                         fidp = &lmdp->lmd_fid;
2356                         stxp = &lmdp->lmd_stx;
2357                         flagsp = &lmdp->lmd_flags;
2358                         lmmsizep = &lmdp->lmd_lmmsize;
2359                         lump = &lmdp->lmd_lmm;
2360                 }
2361
2362                 if (lmmsize == 0) {
2363                         /* If the file has no striping then zero out *lump so
2364                          * that the caller isn't confused by garbage.
2365                          */
2366                         if (clear_user(lump, sizeof(*lump)))
2367                                 GOTO(out_req, rc = -EFAULT);
2368                 } else if (copy_to_user(lump, lmm, lmmsize)) {
2369                         if (copy_to_user(lump, lmm, sizeof(*lump)))
2370                                 GOTO(out_req, rc = -EFAULT);
2371                         rc = -EOVERFLOW;
2372                 }
2373                 api32 = test_bit(LL_SBI_32BIT_API, sbi->ll_flags);
2374
2375                 if (cmd == IOC_MDC_GETFILEINFO_V1 ||
2376                     cmd == LL_IOC_MDC_GETINFO_V1) {
2377                         lstat_t st = { 0 };
2378
2379                         st.st_dev       = inode->i_sb->s_dev;
2380                         st.st_mode      = body->mbo_mode;
2381                         st.st_nlink     = body->mbo_nlink;
2382                         st.st_uid       = body->mbo_uid;
2383                         st.st_gid       = body->mbo_gid;
2384                         st.st_rdev      = body->mbo_rdev;
2385                         if (llcrypt_require_key(inode) == -ENOKEY)
2386                                 st.st_size = round_up(st.st_size,
2387                                                    LUSTRE_ENCRYPTION_UNIT_SIZE);
2388                         else
2389                                 st.st_size = body->mbo_size;
2390                         st.st_blksize   = PAGE_SIZE;
2391                         st.st_blocks    = body->mbo_blocks;
2392                         st.st_atime     = body->mbo_atime;
2393                         st.st_mtime     = body->mbo_mtime;
2394                         st.st_ctime     = body->mbo_ctime;
2395                         st.st_ino       = cl_fid_build_ino(&body->mbo_fid1,
2396                                                            api32);
2397
2398                         if (copy_to_user(statp, &st, sizeof(st)))
2399                                 GOTO(out_req, rc = -EFAULT);
2400                 } else if (cmd == IOC_MDC_GETFILEINFO_V2 ||
2401                            cmd == LL_IOC_MDC_GETINFO_V2) {
2402                         lstatx_t stx = { 0 };
2403                         __u64 valid = body->mbo_valid;
2404
2405                         stx.stx_blksize = PAGE_SIZE;
2406                         stx.stx_nlink = body->mbo_nlink;
2407                         stx.stx_uid = body->mbo_uid;
2408                         stx.stx_gid = body->mbo_gid;
2409                         stx.stx_mode = body->mbo_mode;
2410                         stx.stx_ino = cl_fid_build_ino(&body->mbo_fid1,
2411                                                        api32);
2412                         if (llcrypt_require_key(inode) == -ENOKEY)
2413                                 stx.stx_size = round_up(stx.stx_size,
2414                                                    LUSTRE_ENCRYPTION_UNIT_SIZE);
2415                         else
2416                                 stx.stx_size = body->mbo_size;
2417                         stx.stx_blocks = body->mbo_blocks;
2418                         stx.stx_atime.tv_sec = body->mbo_atime;
2419                         stx.stx_ctime.tv_sec = body->mbo_ctime;
2420                         stx.stx_mtime.tv_sec = body->mbo_mtime;
2421                         stx.stx_btime.tv_sec = body->mbo_btime;
2422                         stx.stx_rdev_major = MAJOR(body->mbo_rdev);
2423                         stx.stx_rdev_minor = MINOR(body->mbo_rdev);
2424                         stx.stx_dev_major = MAJOR(inode->i_sb->s_dev);
2425                         stx.stx_dev_minor = MINOR(inode->i_sb->s_dev);
2426                         stx.stx_mask |= STATX_BASIC_STATS | STATX_BTIME;
2427
2428                         stx.stx_attributes_mask = STATX_ATTR_IMMUTABLE |
2429                                                   STATX_ATTR_APPEND;
2430 #ifdef HAVE_LUSTRE_CRYPTO
2431                         stx.stx_attributes_mask |= STATX_ATTR_ENCRYPTED;
2432 #endif
2433                         if (body->mbo_valid & OBD_MD_FLFLAGS) {
2434                                 stx.stx_attributes |= body->mbo_flags;
2435                                 /* if Lustre specific LUSTRE_ENCRYPT_FL flag is
2436                                  * set, also set ext4 equivalent to please statx
2437                                  */
2438                                 if (body->mbo_flags & LUSTRE_ENCRYPT_FL)
2439                                         stx.stx_attributes |=
2440                                                 STATX_ATTR_ENCRYPTED;
2441                         }
2442
2443                         /* For a striped directory, the size and blocks returned
2444                          * from MDT is not correct.
2445                          * The size and blocks are aggregated by client across
2446                          * all stripes.
2447                          * Thus for a striped directory, do not return the valid
2448                          * FLSIZE and FLBLOCKS flags to the caller.
2449                          * However, this would be better decided by the MDS
2450                          * instead of the client.
2451                          */
2452                         if (cmd == LL_IOC_MDC_GETINFO_V2 &&
2453                             ll_dir_striped(inode))
2454                                 valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
2455
2456                         if (flagsp && copy_to_user(flagsp, &valid,
2457                                                    sizeof(*flagsp)))
2458                                 GOTO(out_req, rc = -EFAULT);
2459
2460                         if (fidp && copy_to_user(fidp, &body->mbo_fid1,
2461                                                  sizeof(*fidp)))
2462                                 GOTO(out_req, rc = -EFAULT);
2463
2464                         if (!(valid & OBD_MD_FLSIZE))
2465                                 stx.stx_mask &= ~STATX_SIZE;
2466                         if (!(valid & OBD_MD_FLBLOCKS))
2467                                 stx.stx_mask &= ~STATX_BLOCKS;
2468
2469                         if (stxp && copy_to_user(stxp, &stx, sizeof(stx)))
2470                                 GOTO(out_req, rc = -EFAULT);
2471
2472                         if (lmmsizep && copy_to_user(lmmsizep, &lmmsize,
2473                                                      sizeof(*lmmsizep)))
2474                                 GOTO(out_req, rc = -EFAULT);
2475                 }
2476
2477                 EXIT;
2478 out_req:
2479                 ptlrpc_req_finished(request);
2480                 ptlrpc_req_finished(root_request);
2481                 if (filename)
2482                         ll_putname(filename);
2483                 return rc;
2484         }
2485         case OBD_IOC_QUOTACTL: {
2486                 struct if_quotactl *qctl;
2487                 int qctl_len = sizeof(*qctl) + LOV_MAXPOOLNAME + 1;
2488
2489                 OBD_ALLOC(qctl, qctl_len);
2490                 if (!qctl)
2491                         RETURN(-ENOMEM);
2492
2493                 if (copy_from_user(qctl, uarg, sizeof(*qctl)))
2494                         GOTO(out_quotactl, rc = -EFAULT);
2495
2496                 if (LUSTRE_Q_CMD_IS_POOL(qctl->qc_cmd)) {
2497                         char __user *from = uarg +
2498                                         offsetof(typeof(*qctl), qc_poolname);
2499                         if (copy_from_user(qctl->qc_poolname, from,
2500                                            LOV_MAXPOOLNAME + 1))
2501                                 GOTO(out_quotactl, rc = -EFAULT);
2502                 }
2503
2504                 rc = quotactl_ioctl(inode->i_sb, qctl);
2505                 if ((rc == 0 || rc == -ENODATA) &&
2506                     copy_to_user(uarg, qctl, sizeof(*qctl)))
2507                         rc = -EFAULT;
2508 out_quotactl:
2509                 OBD_FREE(qctl, qctl_len);
2510                 RETURN(rc);
2511         }
2512         case LL_IOC_GETOBDCOUNT: {
2513                 u32 count, vallen;
2514                 struct obd_export *exp;
2515
2516                 if (copy_from_user(&count, uarg, sizeof(count)))
2517                         RETURN(-EFAULT);
2518
2519                 /* get ost count when count is zero, get mdt count otherwise */
2520                 exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
2521                 vallen = sizeof(count);
2522                 rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
2523                                   KEY_TGT_COUNT, &vallen, &count);
2524                 if (rc) {
2525                         CERROR("%s: get target count failed: rc = %d\n",
2526                                sbi->ll_fsname, rc);
2527                         RETURN(rc);
2528                 }
2529
2530                 if (copy_to_user(uarg, &count, sizeof(count)))
2531                         RETURN(-EFAULT);
2532
2533                 RETURN(0);
2534         }
2535         case LL_IOC_GET_CONNECT_FLAGS:
2536                 RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, uarg));
2537         case LL_IOC_FID2MDTIDX: {
2538                 struct obd_export *exp = ll_i2mdexp(inode);
2539                 struct lu_fid fid;
2540                 __u32 index;
2541
2542                 if (copy_from_user(&fid, uarg, sizeof(fid)))
2543                         RETURN(-EFAULT);
2544
2545                 /* Call mdc_iocontrol */
2546                 rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid,
2547                                    (__u32 __user *)&index);
2548                 if (rc != 0)
2549                         RETURN(rc);
2550
2551                 RETURN(index);
2552         }
2553         case LL_IOC_HSM_REQUEST: {
2554                 struct hsm_user_request *hur;
2555                 ssize_t totalsize;
2556
2557                 OBD_ALLOC_PTR(hur);
2558                 if (hur == NULL)
2559                         RETURN(-ENOMEM);
2560
2561                 /* We don't know the true size yet; copy the fixed-size part */
2562                 if (copy_from_user(hur, uarg, sizeof(*hur))) {
2563                         OBD_FREE_PTR(hur);
2564                         RETURN(-EFAULT);
2565                 }
2566
2567                 /* Compute the whole struct size */
2568                 totalsize = hur_len(hur);
2569                 OBD_FREE_PTR(hur);
2570                 if (totalsize < 0)
2571                         RETURN(-E2BIG);
2572
2573                 /* Final size will be more than double totalsize */
2574                 if (totalsize >= MDS_MAXREQSIZE / 3)
2575                         RETURN(-E2BIG);
2576
2577                 OBD_ALLOC_LARGE(hur, totalsize);
2578                 if (hur == NULL)
2579                         RETURN(-ENOMEM);
2580
2581                 /* Copy the whole struct */
2582                 if (copy_from_user(hur, uarg, totalsize))
2583                         GOTO(out_hur, rc = -EFAULT);
2584
2585                 if (hur->hur_request.hr_action == HUA_RELEASE) {
2586                         const struct lu_fid *fid;
2587                         struct inode *f;
2588                         int i;
2589
2590                         for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
2591                                 fid = &hur->hur_user_item[i].hui_fid;
2592                                 f = search_inode_for_lustre(inode->i_sb, fid);
2593                                 if (IS_ERR(f)) {
2594                                         rc = PTR_ERR(f);
2595                                         break;
2596                                 }
2597
2598                                 rc = ll_hsm_release(f);
2599                                 iput(f);
2600                                 if (rc != 0)
2601                                         break;
2602                         }
2603                 } else {
2604                         rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
2605                                            hur, NULL);
2606                 }
2607 out_hur:
2608                 OBD_FREE_LARGE(hur, totalsize);
2609
2610                 RETURN(rc);
2611         }
2612         case LL_IOC_HSM_PROGRESS: {
2613                 struct hsm_progress_kernel hpk;
2614                 struct hsm_progress hp;
2615
2616                 if (copy_from_user(&hp, uarg, sizeof(hp)))
2617                         RETURN(-EFAULT);
2618
2619                 hpk.hpk_fid = hp.hp_fid;
2620                 hpk.hpk_cookie = hp.hp_cookie;
2621                 hpk.hpk_extent = hp.hp_extent;
2622                 hpk.hpk_flags = hp.hp_flags;
2623                 hpk.hpk_errval = hp.hp_errval;
2624                 hpk.hpk_data_version = 0;
2625
2626                 /* File may not exist in Lustre; all progress
2627                  * reported to Lustre root
2628                  */
2629                 rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
2630                                    NULL);
2631                 RETURN(rc);
2632         }
2633         case LL_IOC_HSM_CT_START:
2634                 if (!capable(CAP_SYS_ADMIN))
2635                         RETURN(-EPERM);
2636
2637                 rc = copy_and_ct_start(cmd, sbi->ll_md_exp, uarg);
2638                 RETURN(rc);
2639
2640         case LL_IOC_HSM_COPY_START: {
2641                 struct hsm_copy *copy;
2642                 int rc;
2643
2644                 OBD_ALLOC_PTR(copy);
2645                 if (copy == NULL)
2646                         RETURN(-ENOMEM);
2647                 if (copy_from_user(copy, uarg, sizeof(*copy))) {
2648                         OBD_FREE_PTR(copy);
2649                         RETURN(-EFAULT);
2650                 }
2651
2652                 rc = ll_ioc_copy_start(inode->i_sb, copy);
2653                 if (copy_to_user(uarg, copy, sizeof(*copy)))
2654                         rc = -EFAULT;
2655
2656                 OBD_FREE_PTR(copy);
2657                 RETURN(rc);
2658         }
2659         case LL_IOC_HSM_COPY_END: {
2660                 struct hsm_copy *copy;
2661                 int rc;
2662
2663                 OBD_ALLOC_PTR(copy);
2664                 if (copy == NULL)
2665                         RETURN(-ENOMEM);
2666                 if (copy_from_user(copy, uarg, sizeof(*copy))) {
2667                         OBD_FREE_PTR(copy);
2668                         RETURN(-EFAULT);
2669                 }
2670
2671                 rc = ll_ioc_copy_end(inode->i_sb, copy);
2672                 if (copy_to_user(uarg, copy, sizeof(*copy)))
2673                         rc = -EFAULT;
2674
2675                 OBD_FREE_PTR(copy);
2676                 RETURN(rc);
2677         }
2678         case LL_IOC_MIGRATE: {
2679                 struct lmv_user_md *lum;
2680                 int len;
2681                 char *filename;
2682                 int namelen = 0;
2683                 __u32 flags;
2684                 int rc;
2685
2686                 rc = obd_ioctl_getdata(&data, &len, uarg);
2687                 if (rc)
2688                         RETURN(rc);
2689
2690                 if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
2691                     data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
2692                         GOTO(migrate_free, rc = -EINVAL);
2693
2694                 filename = data->ioc_inlbuf1;
2695                 namelen = data->ioc_inllen1;
2696                 flags = data->ioc_type;
2697
2698                 if (namelen < 1 || namelen != strlen(filename) + 1) {
2699                         CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
2700                         GOTO(migrate_free, rc = -EINVAL);
2701                 }
2702
2703                 lum = (struct lmv_user_md *)data->ioc_inlbuf2;
2704                 if (lum->lum_magic != LMV_USER_MAGIC &&
2705                     lum->lum_magic != LMV_USER_MAGIC_SPECIFIC) {
2706                         rc = -EINVAL;
2707                         CERROR("%s: wrong lum magic %x: rc = %d\n",
2708                                filename, lum->lum_magic, rc);
2709                         GOTO(migrate_free, rc);
2710                 }
2711
2712                 rc = ll_migrate(inode, file, lum, filename, flags);
2713 migrate_free:
2714                 OBD_FREE_LARGE(data, len);
2715
2716                 RETURN(rc);
2717         }
2718         case LL_IOC_LADVISE2: {
2719                 struct llapi_lu_ladvise2 *ladvise;
2720
2721                 OBD_ALLOC_PTR(ladvise);
2722                 if (ladvise == NULL)
2723                         RETURN(-ENOMEM);
2724
2725                 if (copy_from_user(ladvise, uarg, sizeof(*ladvise)))
2726                         GOTO(out_ladvise, rc = -EFAULT);
2727
2728                 switch (ladvise->lla_advice) {
2729                 case LU_LADVISE_AHEAD:
2730                         if (ladvise->lla_start >= ladvise->lla_end) {
2731                                 CDEBUG(D_VFSTRACE,
2732                                        "%s: Invalid range (%llu %llu) for %s\n",
2733                                        sbi->ll_fsname, ladvise->lla_start,
2734                                        ladvise->lla_end,
2735                                        ladvise_names[ladvise->lla_advice]);
2736                                 GOTO(out_ladvise, rc = -EINVAL);
2737                         }
2738
2739                         /*
2740                          * Currently we only support name indexing format
2741                          * ahead operations.
2742                          */
2743                         if (ladvise->lla_ahead_mode != LU_AH_NAME_INDEX) {
2744                                 CDEBUG(D_VFSTRACE,
2745                                        "%s: Invalid access mode (%d) for %s\n",
2746                                        sbi->ll_fsname, ladvise->lla_ahead_mode,
2747                                        ladvise_names[ladvise->lla_advice]);
2748                                 GOTO(out_ladvise, rc = -EINVAL);
2749                         }
2750
2751                         /* Currently we only support stat-ahead operations. */
2752                         if (!(ladvise->lla_access_flags & ACCESS_FL_STAT)) {
2753                                 CDEBUG(D_VFSTRACE,
2754                                        "%s: Invalid access flags (%x) for %s\n",
2755                                        sbi->ll_fsname,
2756                                        ladvise->lla_access_flags,
2757                                        ladvise_names[ladvise->lla_advice]);
2758                                 GOTO(out_ladvise, rc = -EINVAL);
2759                         }
2760
2761                         rc = ll_ioctl_ahead(file, ladvise);
2762                         break;
2763                 default:
2764                         rc = -EINVAL;
2765                 }
2766 out_ladvise:
2767                 OBD_FREE_PTR(ladvise);
2768                 RETURN(rc);
2769         }
2770         case LL_IOC_PCC_DETACH_BY_FID: {
2771                 struct lu_pcc_detach_fid *detach;
2772                 struct lu_fid *fid;
2773                 struct inode *inode2;
2774                 unsigned long ino;
2775
2776                 OBD_ALLOC_PTR(detach);
2777                 if (detach == NULL)
2778                         RETURN(-ENOMEM);
2779
2780                 if (copy_from_user(detach, uarg, sizeof(*detach)))
2781                         GOTO(out_detach, rc = -EFAULT);
2782
2783                 fid = &detach->pccd_fid;
2784                 ino = cl_fid_build_ino(fid, ll_need_32bit_api(sbi));
2785                 inode2 = ilookup5(inode->i_sb, ino, ll_test_inode_by_fid, fid);
2786                 if (inode2 == NULL)
2787                         /* Target inode is not in inode cache, and PCC file
2788                          * has already been released, return immediately.
2789                          */
2790                         GOTO(out_detach, rc = 0);
2791
2792                 if (!S_ISREG(inode2->i_mode))
2793                         GOTO(out_iput, rc = -EINVAL);
2794
2795                 if (!inode_owner_or_capable(&nop_mnt_idmap, inode2))
2796                         GOTO(out_iput, rc = -EPERM);
2797
2798                 rc = pcc_ioctl_detach(inode2, &detach->pccd_flags);
2799                 if (rc)
2800                         GOTO(out_iput, rc);
2801
2802                 if (copy_to_user((char __user *)arg, detach, sizeof(*detach)))
2803                         GOTO(out_iput, rc = -EFAULT);
2804 out_iput:
2805                 iput(inode2);
2806 out_detach:
2807                 OBD_FREE_PTR(detach);
2808                 RETURN(rc);
2809         }
2810         default:
2811                 rc = ll_iocontrol(inode, file, cmd, uarg);
2812                 if (rc != -ENOTTY)
2813                         RETURN(rc);
2814                 RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, uarg));
2815         }
2816 }
2817
2818 static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
2819 {
2820         struct inode *inode = file->f_mapping->host;
2821         struct ll_file_data *fd = file->private_data;
2822         struct ll_sb_info *sbi = ll_i2sbi(inode);
2823         int api32 = ll_need_32bit_api(sbi);
2824         loff_t ret = -EINVAL;
2825
2826         ENTRY;
2827         ll_inode_lock(inode);
2828         switch (origin) {
2829         case SEEK_SET:
2830                 break;
2831         case SEEK_CUR:
2832                 offset += file->f_pos;
2833                 break;
2834         case SEEK_END:
2835                 if (offset > 0)
2836                         GOTO(out, ret);
2837                 if (api32)
2838                         offset += LL_DIR_END_OFF_32BIT;
2839                 else
2840                         offset += LL_DIR_END_OFF;
2841                 break;
2842         default:
2843                 GOTO(out, ret);
2844         }
2845
2846         if (offset >= 0 &&
2847             ((api32 && offset <= LL_DIR_END_OFF_32BIT) ||
2848              (!api32 && offset <= LL_DIR_END_OFF))) {
2849                 if (offset != file->f_pos) {
2850                         bool hash64;
2851
2852                         hash64 = test_bit(LL_SBI_64BIT_HASH, sbi->ll_flags);
2853                         if ((api32 && offset == LL_DIR_END_OFF_32BIT) ||
2854                             (!api32 && offset == LL_DIR_END_OFF))
2855                                 fd->lfd_pos = MDS_DIR_END_OFF;
2856                         else if (api32 && hash64)
2857                                 fd->lfd_pos = offset << 32;
2858                         else
2859                                 fd->lfd_pos = offset;
2860                         file->f_pos = offset;
2861                         file->f_version = 0;
2862                 }
2863                 ret = offset;
2864         }
2865         GOTO(out, ret);
2866
2867 out:
2868         ll_inode_unlock(inode);
2869         return ret;
2870 }
2871
2872 static int ll_dir_open(struct inode *inode, struct file *file)
2873 {
2874         ENTRY;
2875         RETURN(ll_file_open(inode, file));
2876 }
2877
2878 static int ll_dir_release(struct inode *inode, struct file *file)
2879 {
2880         ENTRY;
2881         RETURN(ll_file_release(inode, file));
2882 }
2883
2884 /* notify error if partially read striped directory */
2885 static int ll_dir_flush(struct file *file, fl_owner_t id)
2886 {
2887         struct ll_file_data *lfd = file->private_data;
2888         int rc = lfd->fd_partial_readdir_rc;
2889
2890         lfd->fd_partial_readdir_rc = 0;
2891
2892         return rc;
2893 }
2894
2895 const struct file_operations ll_dir_operations = {
2896         .llseek         = ll_dir_seek,
2897         .open           = ll_dir_open,
2898         .release        = ll_dir_release,
2899         .read           = generic_read_dir,
2900 #ifdef HAVE_DIR_CONTEXT
2901         .iterate_shared = ll_iterate,
2902 #else
2903         .readdir        = ll_readdir,
2904 #endif
2905         .unlocked_ioctl = ll_dir_ioctl,
2906         .fsync          = ll_fsync,
2907         .flush          = ll_dir_flush,
2908 };