Whamcloud - gitweb
LU-14490 lmv: striped directory as subdirectory mount
[fs/lustre-release.git] / lustre / mdt / mdt_handler.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2010, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * lustre/mdt/mdt_handler.c
33  *
34  * Lustre Metadata Target (mdt) request handler
35  *
36  * Author: Peter Braam <braam@clusterfs.com>
37  * Author: Andreas Dilger <adilger@clusterfs.com>
38  * Author: Phil Schwan <phil@clusterfs.com>
39  * Author: Mike Shaver <shaver@clusterfs.com>
40  * Author: Nikita Danilov <nikita@clusterfs.com>
41  * Author: Huang Hua <huanghua@clusterfs.com>
42  * Author: Yury Umanets <umka@clusterfs.com>
43  */
44
45 #define DEBUG_SUBSYSTEM S_MDS
46
47 #include <linux/module.h>
48 #include <linux/pagemap.h>
49
50 #include <dt_object.h>
51 #include <lustre_acl.h>
52 #include <lustre_export.h>
53 #include <uapi/linux/lustre/lustre_ioctl.h>
54 #include <lustre_lfsck.h>
55 #include <lustre_log.h>
56 #include <lustre_nodemap.h>
57 #include <lustre_mds.h>
58 #include <uapi/linux/lustre/lustre_param.h>
59 #include <lustre_quota.h>
60 #include <lustre_swab.h>
61 #include <obd.h>
62 #include <obd_support.h>
63 #include <lustre_barrier.h>
64 #include <obd_cksum.h>
65 #include <llog_swab.h>
66
67 #include "mdt_internal.h"
68
69 static unsigned int max_mod_rpcs_per_client = 8;
70 module_param(max_mod_rpcs_per_client, uint, 0644);
71 MODULE_PARM_DESC(max_mod_rpcs_per_client, "maximum number of modify RPCs in flight allowed per client");
72
73 mdl_mode_t mdt_mdl_lock_modes[] = {
74         [LCK_MINMODE] = MDL_MINMODE,
75         [LCK_EX]      = MDL_EX,
76         [LCK_PW]      = MDL_PW,
77         [LCK_PR]      = MDL_PR,
78         [LCK_CW]      = MDL_CW,
79         [LCK_CR]      = MDL_CR,
80         [LCK_NL]      = MDL_NL,
81         [LCK_GROUP]   = MDL_GROUP
82 };
83
84 enum ldlm_mode mdt_dlm_lock_modes[] = {
85         [MDL_MINMODE]   = LCK_MINMODE,
86         [MDL_EX]        = LCK_EX,
87         [MDL_PW]        = LCK_PW,
88         [MDL_PR]        = LCK_PR,
89         [MDL_CW]        = LCK_CW,
90         [MDL_CR]        = LCK_CR,
91         [MDL_NL]        = LCK_NL,
92         [MDL_GROUP]     = LCK_GROUP
93 };
94
95 static struct mdt_device *mdt_dev(struct lu_device *d);
96
97 static const struct lu_object_operations mdt_obj_ops;
98
99 /* Slab for MDT object allocation */
100 static struct kmem_cache *mdt_object_kmem;
101
102 /* For HSM restore handles */
103 struct kmem_cache *mdt_hsm_cdt_kmem;
104
105 /* For HSM request handles */
106 struct kmem_cache *mdt_hsm_car_kmem;
107
108 static struct lu_kmem_descr mdt_caches[] = {
109         {
110                 .ckd_cache = &mdt_object_kmem,
111                 .ckd_name  = "mdt_obj",
112                 .ckd_size  = sizeof(struct mdt_object)
113         },
114         {
115                 .ckd_cache      = &mdt_hsm_cdt_kmem,
116                 .ckd_name       = "mdt_cdt_restore_handle",
117                 .ckd_size       = sizeof(struct cdt_restore_handle)
118         },
119         {
120                 .ckd_cache      = &mdt_hsm_car_kmem,
121                 .ckd_name       = "mdt_cdt_agent_req",
122                 .ckd_size       = sizeof(struct cdt_agent_req)
123         },
124         {
125                 .ckd_cache = NULL
126         }
127 };
128
129 __u64 mdt_get_disposition(struct ldlm_reply *rep, __u64 op_flag)
130 {
131         if (!rep)
132                 return 0;
133         return rep->lock_policy_res1 & op_flag;
134 }
135
136 void mdt_clear_disposition(struct mdt_thread_info *info,
137                            struct ldlm_reply *rep, __u64 op_flag)
138 {
139         if (info) {
140                 info->mti_opdata &= ~op_flag;
141                 tgt_opdata_clear(info->mti_env, op_flag);
142         }
143         if (rep)
144                 rep->lock_policy_res1 &= ~op_flag;
145 }
146
147 void mdt_set_disposition(struct mdt_thread_info *info,
148                          struct ldlm_reply *rep, __u64 op_flag)
149 {
150         if (info) {
151                 info->mti_opdata |= op_flag;
152                 tgt_opdata_set(info->mti_env, op_flag);
153         }
154         if (rep)
155                 rep->lock_policy_res1 |= op_flag;
156 }
157
158 void mdt_lock_reg_init(struct mdt_lock_handle *lh, enum ldlm_mode lm)
159 {
160         lh->mlh_pdo_hash = 0;
161         lh->mlh_reg_mode = lm;
162         lh->mlh_rreg_mode = lm;
163         lh->mlh_type = MDT_REG_LOCK;
164 }
165
166 void mdt_lock_pdo_init(struct mdt_lock_handle *lh, enum ldlm_mode lock_mode,
167                        const struct lu_name *lname)
168 {
169         lh->mlh_reg_mode = lock_mode;
170         lh->mlh_pdo_mode = LCK_MINMODE;
171         lh->mlh_rreg_mode = lock_mode;
172         lh->mlh_type = MDT_PDO_LOCK;
173
174         if (lu_name_is_valid(lname)) {
175                 lh->mlh_pdo_hash = ll_full_name_hash(NULL, lname->ln_name,
176                                                      lname->ln_namelen);
177                 /* XXX Workaround for LU-2856
178                  *
179                  * Zero is a valid return value of full_name_hash, but
180                  * several users of mlh_pdo_hash assume a non-zero
181                  * hash value. We therefore map zero onto an
182                  * arbitrary, but consistent value (1) to avoid
183                  * problems further down the road. */
184                 if (unlikely(lh->mlh_pdo_hash == 0))
185                         lh->mlh_pdo_hash = 1;
186         } else {
187                 lh->mlh_pdo_hash = 0;
188         }
189 }
190
191 static void mdt_lock_pdo_mode(struct mdt_thread_info *info, struct mdt_object *o,
192                               struct mdt_lock_handle *lh)
193 {
194         mdl_mode_t mode;
195         ENTRY;
196
197         /*
198          * Any dir access needs couple of locks:
199          *
200          * 1) on part of dir we gonna take lookup/modify;
201          *
202          * 2) on whole dir to protect it from concurrent splitting and/or to
203          * flush client's cache for readdir().
204          *
205          * so, for a given mode and object this routine decides what lock mode
206          * to use for lock #2:
207          *
208          * 1) if caller's gonna lookup in dir then we need to protect dir from
209          * being splitted only - LCK_CR
210          *
211          * 2) if caller's gonna modify dir then we need to protect dir from
212          * being splitted and to flush cache - LCK_CW
213          *
214          * 3) if caller's gonna modify dir and that dir seems ready for
215          * splitting then we need to protect it from any type of access
216          * (lookup/modify/split) - LCK_EX --bzzz
217          */
218
219         LASSERT(lh->mlh_reg_mode != LCK_MINMODE);
220         LASSERT(lh->mlh_pdo_mode == LCK_MINMODE);
221
222         /*
223          * Ask underlaying level its opinion about preferable PDO lock mode
224          * having access type passed as regular lock mode:
225          *
226          * - MDL_MINMODE means that lower layer does not want to specify lock
227          * mode;
228          *
229          * - MDL_NL means that no PDO lock should be taken. This is used in some
230          * cases. Say, for non-splittable directories no need to use PDO locks
231          * at all.
232          */
233         mode = mdo_lock_mode(info->mti_env, mdt_object_child(o),
234                              mdt_dlm_mode2mdl_mode(lh->mlh_reg_mode));
235
236         if (mode != MDL_MINMODE) {
237                 lh->mlh_pdo_mode = mdt_mdl_mode2dlm_mode(mode);
238         } else {
239                 /*
240                  * Lower layer does not want to specify locking mode. We do it
241                  * our selves. No special protection is needed, just flush
242                  * client's cache on modification and allow concurrent
243                  * mondification.
244                  */
245                 switch (lh->mlh_reg_mode) {
246                 case LCK_EX:
247                         lh->mlh_pdo_mode = LCK_EX;
248                         break;
249                 case LCK_PR:
250                         lh->mlh_pdo_mode = LCK_CR;
251                         break;
252                 case LCK_PW:
253                         lh->mlh_pdo_mode = LCK_CW;
254                         break;
255                 default:
256                         CERROR("Not expected lock type (0x%x)\n",
257                                (int)lh->mlh_reg_mode);
258                         LBUG();
259                 }
260         }
261
262         LASSERT(lh->mlh_pdo_mode != LCK_MINMODE);
263         EXIT;
264 }
265
266 static int mdt_lookup_fileset(struct mdt_thread_info *info, const char *fileset,
267                               struct lu_fid *fid)
268 {
269         struct mdt_device *mdt = info->mti_mdt;
270         struct lu_name *lname = &info->mti_name;
271         const char *start = fileset;
272         char *filename = info->mti_filename;
273         struct mdt_object *parent;
274         u32 mode;
275         int rc = 0;
276
277         LASSERT(!info->mti_cross_ref);
278
279         /*
280          * We may want to allow this to mount a completely separate
281          * fileset from the MDT in the future, but keeping it to
282          * ROOT/ only for now avoid potential security issues.
283          */
284         *fid = mdt->mdt_md_root_fid;
285
286         while (rc == 0 && start != NULL && *start != '\0') {
287                 const char *s1 = start;
288                 const char *s2;
289
290                 while (*++s1 == '/')
291                         ;
292                 s2 = s1;
293                 while (*s2 != '/' && *s2 != '\0')
294                         s2++;
295
296                 if (s2 == s1)
297                         break;
298
299                 start = s2;
300
301                 lname->ln_namelen = s2 - s1;
302                 if (lname->ln_namelen > NAME_MAX) {
303                         rc = -EINVAL;
304                         break;
305                 }
306
307                 /* reject .. as a path component */
308                 if (lname->ln_namelen == 2 &&
309                     strncmp(s1, "..", 2) == 0) {
310                         rc = -EINVAL;
311                         break;
312                 }
313
314                 strncpy(filename, s1, lname->ln_namelen);
315                 filename[lname->ln_namelen] = '\0';
316                 lname->ln_name = filename;
317
318                 parent = mdt_object_find(info->mti_env, mdt, fid);
319                 if (IS_ERR(parent)) {
320                         rc = PTR_ERR(parent);
321                         break;
322                 }
323                 /* Only got the fid of this obj by name */
324                 fid_zero(fid);
325                 rc = mdo_lookup(info->mti_env, mdt_object_child(parent), lname,
326                                 fid, &info->mti_spec);
327                 mdt_object_put(info->mti_env, parent);
328         }
329         if (!rc) {
330                 parent = mdt_object_find(info->mti_env, mdt, fid);
331                 if (IS_ERR(parent))
332                         rc = PTR_ERR(parent);
333                 else {
334                         mode = lu_object_attr(&parent->mot_obj);
335                         if (!S_ISDIR(mode)) {
336                                 rc = -ENOTDIR;
337                         } else if (mdt_is_remote_object(info, parent, parent)) {
338                                 if (!mdt->mdt_enable_remote_subdir_mount) {
339                                         rc = -EREMOTE;
340                                         LCONSOLE_WARN("%s: subdir mount '%s' refused because 'enable_remote_subdir_mount=0': rc = %d\n",
341                                                       mdt_obd_name(mdt),
342                                                       fileset, rc);
343                                 } else {
344                                         LCONSOLE_INFO("%s: subdir mount '%s' is remote and may be slow\n",
345                                                       mdt_obd_name(mdt),
346                                                       fileset);
347                                 }
348                         }
349                         mdt_object_put(info->mti_env, parent);
350                 }
351         }
352
353         return rc;
354 }
355
356 static int mdt_get_root(struct tgt_session_info *tsi)
357 {
358         struct mdt_thread_info  *info = tsi2mdt_info(tsi);
359         struct mdt_device       *mdt = info->mti_mdt;
360         struct mdt_body         *repbody;
361         char                    *fileset = NULL, *buffer = NULL;
362         int                      rc;
363         struct obd_export       *exp = info->mti_exp;
364         char                    *nodemap_fileset;
365
366         ENTRY;
367
368         rc = mdt_check_ucred(info);
369         if (rc)
370                 GOTO(out, rc = err_serious(rc));
371
372         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GET_ROOT_PACK))
373                 GOTO(out, rc = err_serious(-ENOMEM));
374
375         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
376         if (req_capsule_get_size(info->mti_pill, &RMF_NAME, RCL_CLIENT) > 0) {
377                 fileset = req_capsule_client_get(info->mti_pill, &RMF_NAME);
378                 if (fileset == NULL)
379                         GOTO(out, rc = err_serious(-EFAULT));
380         }
381
382         nodemap_fileset = nodemap_get_fileset(exp->exp_target_data.ted_nodemap);
383         if (nodemap_fileset && nodemap_fileset[0]) {
384                 CDEBUG(D_INFO, "nodemap fileset is %s\n", nodemap_fileset);
385                 if (fileset) {
386                         /* consider fileset from client as a sub-fileset
387                          * of the nodemap one */
388                         OBD_ALLOC(buffer, PATH_MAX + 1);
389                         if (buffer == NULL)
390                                 GOTO(out, rc = err_serious(-ENOMEM));
391                         if (snprintf(buffer, PATH_MAX + 1, "%s/%s",
392                                      nodemap_fileset, fileset) >= PATH_MAX + 1)
393                                 GOTO(out, rc = err_serious(-EINVAL));
394                         fileset = buffer;
395                 } else {
396                         /* enforce fileset as specified in the nodemap */
397                         fileset = nodemap_fileset;
398                 }
399         }
400
401         if (fileset) {
402                 CDEBUG(D_INFO, "Getting fileset %s\n", fileset);
403                 rc = mdt_lookup_fileset(info, fileset, &repbody->mbo_fid1);
404                 if (rc < 0)
405                         GOTO(out, rc = err_serious(rc));
406         } else {
407                 repbody->mbo_fid1 = mdt->mdt_md_root_fid;
408         }
409         repbody->mbo_valid |= OBD_MD_FLID;
410
411         EXIT;
412 out:
413         mdt_thread_info_fini(info);
414         if (buffer)
415                 OBD_FREE(buffer, PATH_MAX+1);
416         return rc;
417 }
418
419 static int mdt_statfs(struct tgt_session_info *tsi)
420 {
421         struct ptlrpc_request *req = tgt_ses_req(tsi);
422         struct mdt_thread_info *info = tsi2mdt_info(tsi);
423         struct mdt_device *mdt = info->mti_mdt;
424         struct tg_grants_data *tgd = &mdt->mdt_lut.lut_tgd;
425         struct md_device *next = mdt->mdt_child;
426         struct ptlrpc_service_part *svcpt;
427         struct obd_statfs *osfs;
428         struct mdt_body *reqbody = NULL;
429         struct mdt_statfs_cache *msf;
430         int rc;
431
432         ENTRY;
433
434         svcpt = req->rq_rqbd->rqbd_svcpt;
435
436         /* This will trigger a watchdog timeout */
437         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP,
438                          (MDT_SERVICE_WATCHDOG_FACTOR *
439                           at_get(&svcpt->scp_at_estimate)) + 1);
440
441         rc = mdt_check_ucred(info);
442         if (rc)
443                 GOTO(out, rc = err_serious(rc));
444
445         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STATFS_PACK))
446                 GOTO(out, rc = err_serious(-ENOMEM));
447
448         osfs = req_capsule_server_get(info->mti_pill, &RMF_OBD_STATFS);
449         if (!osfs)
450                 GOTO(out, rc = -EPROTO);
451
452         if (mdt_is_sum_statfs_client(req->rq_export) &&
453                 lustre_packed_msg_size(req->rq_reqmsg) ==
454                 req_capsule_fmt_size(req->rq_reqmsg->lm_magic,
455                                      &RQF_MDS_STATFS_NEW, RCL_CLIENT)) {
456                 req_capsule_extend(info->mti_pill, &RQF_MDS_STATFS_NEW);
457                 reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
458         }
459
460         if (reqbody && reqbody->mbo_valid & OBD_MD_FLAGSTATFS)
461                 msf = &mdt->mdt_sum_osfs;
462         else
463                 msf = &mdt->mdt_osfs;
464
465         if (msf->msf_age + OBD_STATFS_CACHE_SECONDS <= ktime_get_seconds()) {
466                         /** statfs data is too old, get up-to-date one */
467                         if (reqbody && reqbody->mbo_valid & OBD_MD_FLAGSTATFS)
468                                 rc = next->md_ops->mdo_statfs(info->mti_env,
469                                                               next, osfs);
470                         else
471                                 rc = dt_statfs(info->mti_env, mdt->mdt_bottom,
472                                                osfs);
473                         if (rc)
474                                 GOTO(out, rc);
475                         spin_lock(&mdt->mdt_lock);
476                         msf->msf_osfs = *osfs;
477                         msf->msf_age = ktime_get_seconds();
478                         spin_unlock(&mdt->mdt_lock);
479         } else {
480                         /** use cached statfs data */
481                         spin_lock(&mdt->mdt_lock);
482                         *osfs = msf->msf_osfs;
483                         spin_unlock(&mdt->mdt_lock);
484         }
485
486         /* at least try to account for cached pages.  its still racy and
487          * might be under-reporting if clients haven't announced their
488          * caches with brw recently */
489         CDEBUG(D_SUPER | D_CACHE, "blocks cached %llu granted %llu"
490                " pending %llu free %llu avail %llu\n",
491                tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
492                tgd->tgd_tot_pending,
493                osfs->os_bfree << tgd->tgd_blockbits,
494                osfs->os_bavail << tgd->tgd_blockbits);
495
496         osfs->os_bavail -= min_t(u64, osfs->os_bavail,
497                                  ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
498                                    osfs->os_bsize - 1) >> tgd->tgd_blockbits));
499
500         tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__);
501         CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; "
502                "%llu objects: %llu free; state %x\n",
503                osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
504                osfs->os_files, osfs->os_ffree, osfs->os_state);
505
506         if (!exp_grant_param_supp(tsi->tsi_exp) &&
507             tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
508                 /* clients which don't support OBD_CONNECT_GRANT_PARAM
509                  * should not see a block size > page size, otherwise
510                  * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12)
511                  * block size which is the biggest block size known to work
512                  * with all client's page size. */
513                 osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
514                 osfs->os_bfree  <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
515                 osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
516                 osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
517         }
518         if (rc == 0)
519                 mdt_counter_incr(req, LPROC_MDT_STATFS);
520 out:
521         mdt_thread_info_fini(info);
522         RETURN(rc);
523 }
524
525 /**
526  * Pack size attributes into the reply.
527  */
528 int mdt_pack_size2body(struct mdt_thread_info *info,
529                         const struct lu_fid *fid, struct lustre_handle *lh)
530 {
531         struct mdt_body *b;
532         struct md_attr *ma = &info->mti_attr;
533         int dom_stripe;
534         bool dom_lock = false;
535
536         ENTRY;
537
538         LASSERT(ma->ma_attr.la_valid & LA_MODE);
539
540         if (!S_ISREG(ma->ma_attr.la_mode) ||
541             !(ma->ma_valid & MA_LOV && ma->ma_lmm != NULL))
542                 RETURN(-ENODATA);
543
544         dom_stripe = mdt_lmm_dom_entry(ma->ma_lmm);
545         /* no DoM stripe, no size in reply */
546         if (dom_stripe == LMM_NO_DOM)
547                 RETURN(-ENOENT);
548
549         if (lustre_handle_is_used(lh)) {
550                 struct ldlm_lock *lock;
551
552                 lock = ldlm_handle2lock(lh);
553                 if (lock != NULL) {
554                         dom_lock = ldlm_has_dom(lock);
555                         LDLM_LOCK_PUT(lock);
556                 }
557         }
558
559         /* no DoM lock, no size in reply */
560         if (!dom_lock)
561                 RETURN(0);
562
563         /* Either DoM lock exists or LMM has only DoM stripe then
564          * return size on body. */
565         b = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
566
567         mdt_dom_object_size(info->mti_env, info->mti_mdt, fid, b, dom_lock);
568         RETURN(0);
569 }
570
#ifdef CONFIG_FS_POSIX_ACL
/*
 * Pack ACL data into the reply. UIDs/GIDs are mapped and filtered by nodemap.
 *
 * The access ACL xattr is read into the RMF_ACL reply buffer.  If that
 * buffer is too small (-ERANGE) and the client supports large ACLs, the
 * read is retried once into info->mti_big_acl, which is allocated on
 * demand with mdt_max_ea_size bytes.
 *
 * \param       info    thread info object
 * \param       repbody reply to pack ACLs into
 * \param       o       mdt object of file to examine
 * \param       nodemap nodemap of client to reply to
 * \retval      0       success
 * \retval      -errno  error getting or parsing ACL from disk
 */
int mdt_pack_acl2body(struct mdt_thread_info *info, struct mdt_body *repbody,
                      struct mdt_object *o, struct lu_nodemap *nodemap)
{
        const struct lu_env *env = info->mti_env;
        struct md_object *next = mdt_object_child(o);
        struct lu_buf *buf = &info->mti_buf;
        struct mdt_device *mdt = info->mti_mdt;
        struct req_capsule *pill = info->mti_pill;
        int acl_buflen;
        int rc;

        ENTRY;

        buf->lb_buf = req_capsule_server_get(pill, &RMF_ACL);
        buf->lb_len = req_capsule_get_size(pill, &RMF_ACL, RCL_SERVER);
        if (buf->lb_len == 0)
                RETURN(0);

        for (;;) {
                rc = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_ACCESS);

                /* retry only when the buffer was too small, the client can
                 * take a large ACL, and the big buffer was not tried yet */
                if (rc != -ERANGE ||
                    !exp_connect_large_acl(info->mti_exp) ||
                    buf->lb_buf == info->mti_big_acl)
                        break;

                if (info->mti_big_acl == NULL) {
                        OBD_ALLOC_LARGE(info->mti_big_acl,
                                        mdt->mdt_max_ea_size);
                        if (info->mti_big_acl == NULL) {
                                CERROR("%s: unable to grow "
                                       DFID" ACL buffer\n",
                                       mdt_obd_name(mdt),
                                       PFID(mdt_object_fid(o)));
                                RETURN(-ENOMEM);
                        }

                        info->mti_big_aclsize = mdt->mdt_max_ea_size;
                }

                CDEBUG(D_INODE, "%s: grow the "DFID
                       " ACL buffer to size %d\n",
                       mdt_obd_name(mdt),
                       PFID(mdt_object_fid(o)),
                       mdt->mdt_max_ea_size);

                buf->lb_buf = info->mti_big_acl;
                buf->lb_len = info->mti_big_aclsize;
        }

        if (rc < 0) {
                switch (rc) {
                case -ENODATA:
                        /* no ACL stored: report an empty, valid ACL */
                        repbody->mbo_aclsize = 0;
                        repbody->mbo_valid |= OBD_MD_FLACL;
                        rc = 0;
                        break;
                case -EOPNOTSUPP:
                        /* not an error, but OBD_MD_FLACL stays unset */
                        rc = 0;
                        break;
                default:
                        CERROR("%s: unable to read "DFID" ACL: rc = %d\n",
                               mdt_obd_name(mdt), PFID(mdt_object_fid(o)), rc);
                        break;
                }
                RETURN(rc);
        }

        /* rc is the ACL size from here on */
        acl_buflen = req_capsule_get_size(pill, &RMF_ACL, RCL_SERVER);
        if (acl_buflen < rc) {
                int client;
                int server;
                int lmm_buflen = 0;
                int lmmsize = 0;
                int needed;

                /* If LOV/LMA EA is small, we can reuse part of their buffer */
                client = ptlrpc_req_get_repsize(pill->rc_req);
                server = lustre_packed_msg_size(pill->rc_req->rq_repmsg);
                if (req_capsule_has_field(pill, &RMF_MDT_MD, RCL_SERVER)) {
                        lmm_buflen = req_capsule_get_size(pill, &RMF_MDT_MD,
                                                          RCL_SERVER);
                        lmmsize = repbody->mbo_eadatasize;
                }

                needed = server - acl_buflen - lmm_buflen + rc + lmmsize;
                if (client < needed) {
                        CDEBUG(D_INODE, "%s: client prepared buffer size %d "
                               "is not big enough with the ACL size %d (%d)\n",
                               mdt_obd_name(mdt), client, rc, needed);
                        repbody->mbo_aclsize = 0;
                        repbody->mbo_valid &= ~OBD_MD_FLACL;
                        RETURN(-ERANGE);
                }
        }

        if (buf->lb_buf == info->mti_big_acl)
                info->mti_big_acl_used = 1;

        rc = nodemap_map_acl(nodemap, buf->lb_buf,
                             rc, NODEMAP_FS_TO_CLIENT);
        /* if all ACLs mapped out, rc is still >= 0 */
        if (rc < 0) {
                CERROR("%s: nodemap_map_acl unable to parse "DFID
                       " ACL: rc = %d\n", mdt_obd_name(mdt),
                       PFID(mdt_object_fid(o)), rc);
                repbody->mbo_aclsize = 0;
                repbody->mbo_valid &= ~OBD_MD_FLACL;
                RETURN(rc);
        }

        repbody->mbo_aclsize = rc;
        repbody->mbo_valid |= OBD_MD_FLACL;
        RETURN(0);
}
#endif
695
696 /* XXX Look into layout in MDT layer. */
697 static inline bool mdt_hsm_is_released(struct lov_mds_md *lmm)
698 {
699         struct lov_comp_md_v1   *comp_v1;
700         struct lov_mds_md       *v1;
701         int                      i;
702
703         if (lmm->lmm_magic == LOV_MAGIC_COMP_V1) {
704                 comp_v1 = (struct lov_comp_md_v1 *)lmm;
705
706                 for (i = 0; i < comp_v1->lcm_entry_count; i++) {
707                         v1 = (struct lov_mds_md *)((char *)comp_v1 +
708                                 comp_v1->lcm_entries[i].lcme_offset);
709                         /* We don't support partial release for now */
710                         if (!(v1->lmm_pattern & LOV_PATTERN_F_RELEASED))
711                                 return false;
712                 }
713                 return true;
714         } else {
715                 return (lmm->lmm_pattern & LOV_PATTERN_F_RELEASED) ?
716                         true : false;
717         }
718 }
719
720 void mdt_pack_attr2body(struct mdt_thread_info *info, struct mdt_body *b,
721                         const struct lu_attr *attr, const struct lu_fid *fid)
722 {
723         struct md_attr *ma = &info->mti_attr;
724         struct obd_export *exp = info->mti_exp;
725         struct lu_nodemap *nodemap = NULL;
726
727         LASSERT(ma->ma_valid & MA_INODE);
728
729         if (attr->la_valid & LA_ATIME) {
730                 b->mbo_atime = attr->la_atime;
731                 b->mbo_valid |= OBD_MD_FLATIME;
732         }
733         if (attr->la_valid & LA_MTIME) {
734                 b->mbo_mtime = attr->la_mtime;
735                 b->mbo_valid |= OBD_MD_FLMTIME;
736         }
737         if (attr->la_valid & LA_CTIME) {
738                 b->mbo_ctime = attr->la_ctime;
739                 b->mbo_valid |= OBD_MD_FLCTIME;
740         }
741         if (attr->la_valid & LA_FLAGS) {
742                 b->mbo_flags = attr->la_flags;
743                 b->mbo_valid |= OBD_MD_FLFLAGS;
744         }
745         if (attr->la_valid & LA_NLINK) {
746                 b->mbo_nlink = attr->la_nlink;
747                 b->mbo_valid |= OBD_MD_FLNLINK;
748         }
749         if (attr->la_valid & (LA_UID|LA_GID)) {
750                 nodemap = nodemap_get_from_exp(exp);
751                 if (IS_ERR(nodemap))
752                         goto out;
753         }
754         if (attr->la_valid & LA_UID) {
755                 b->mbo_uid = nodemap_map_id(nodemap, NODEMAP_UID,
756                                             NODEMAP_FS_TO_CLIENT,
757                                             attr->la_uid);
758                 b->mbo_valid |= OBD_MD_FLUID;
759         }
760         if (attr->la_valid & LA_GID) {
761                 b->mbo_gid = nodemap_map_id(nodemap, NODEMAP_GID,
762                                             NODEMAP_FS_TO_CLIENT,
763                                             attr->la_gid);
764                 b->mbo_valid |= OBD_MD_FLGID;
765         }
766
767         if (attr->la_valid & LA_PROJID) {
768                 /* TODO, nodemap for project id */
769                 b->mbo_projid = attr->la_projid;
770                 b->mbo_valid |= OBD_MD_FLPROJID;
771         }
772
773         b->mbo_mode = attr->la_mode;
774         if (attr->la_valid & LA_MODE)
775                 b->mbo_valid |= OBD_MD_FLMODE;
776         if (attr->la_valid & LA_TYPE)
777                 b->mbo_valid |= OBD_MD_FLTYPE;
778
779         if (fid != NULL) {
780                 b->mbo_fid1 = *fid;
781                 b->mbo_valid |= OBD_MD_FLID;
782                 CDEBUG(D_INODE, DFID": nlink=%d, mode=%o, valid=%#llx\n",
783                        PFID(fid), b->mbo_nlink, b->mbo_mode, b->mbo_valid);
784         }
785
786         if (!(attr->la_valid & LA_TYPE))
787                 return;
788
789         b->mbo_rdev   = attr->la_rdev;
790         b->mbo_size   = attr->la_size;
791         b->mbo_blocks = attr->la_blocks;
792
793         if (!S_ISREG(attr->la_mode)) {
794                 b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLRDEV;
795         } else if (ma->ma_need & MA_LOV && !(ma->ma_valid & MA_LOV)) {
796                 /* means no objects are allocated on osts. */
797                 LASSERT(!(ma->ma_valid & MA_LOV));
798                 /* just ignore blocks occupied by extend attributes on MDS */
799                 b->mbo_blocks = 0;
800                 /* if no object is allocated on osts, the size on mds is valid.
801                  * b=22272 */
802                 b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
803         } else if ((ma->ma_valid & MA_LOV) && ma->ma_lmm != NULL) {
804                 if (mdt_hsm_is_released(ma->ma_lmm)) {
805                         /* A released file stores its size on MDS. */
806                         /* But return 1 block for released file, unless tools
807                          * like tar will consider it fully sparse. (LU-3864)
808                          */
809                         if (unlikely(b->mbo_size == 0))
810                                 b->mbo_blocks = 0;
811                         else
812                                 b->mbo_blocks = 1;
813                         b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
814                 } else if (info->mti_som_valid) { /* som is valid */
815                         b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
816                 } else if (ma->ma_valid & MA_SOM) { /* lsom is valid */
817                         b->mbo_valid |= OBD_MD_FLLAZYSIZE | OBD_MD_FLLAZYBLOCKS;
818                         b->mbo_size = ma->ma_som.ms_size;
819                         b->mbo_blocks = ma->ma_som.ms_blocks;
820                 }
821         }
822
823         if (fid != NULL && (b->mbo_valid & OBD_MD_FLSIZE ||
824                             b->mbo_valid & OBD_MD_FLLAZYSIZE))
825                 CDEBUG(D_VFSTRACE, DFID": returning size %llu\n",
826                        PFID(fid), (unsigned long long)b->mbo_size);
827
828 out:
829         if (!IS_ERR_OR_NULL(nodemap))
830                 nodemap_putref(nodemap);
831 }
832
833 static inline int mdt_body_has_lov(const struct lu_attr *la,
834                                    const struct mdt_body *body)
835 {
836         return (S_ISREG(la->la_mode) && (body->mbo_valid & OBD_MD_FLEASIZE)) ||
837                (S_ISDIR(la->la_mode) && (body->mbo_valid & OBD_MD_FLDIREA));
838 }
839
840 void mdt_client_compatibility(struct mdt_thread_info *info)
841 {
842         struct mdt_body       *body;
843         struct ptlrpc_request *req = mdt_info_req(info);
844         struct obd_export     *exp = req->rq_export;
845         struct md_attr        *ma = &info->mti_attr;
846         struct lu_attr        *la = &ma->ma_attr;
847         ENTRY;
848
849         if (exp_connect_layout(exp))
850                 /* the client can deal with 16-bit lmm_stripe_count */
851                 RETURN_EXIT;
852
853         body = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
854
855         if (!mdt_body_has_lov(la, body))
856                 RETURN_EXIT;
857
858         /* now we have a reply with a lov for a client not compatible with the
859          * layout lock so we have to clean the layout generation number */
860         if (S_ISREG(la->la_mode))
861                 ma->ma_lmm->lmm_layout_gen = 0;
862         EXIT;
863 }
864
865 static int mdt_attr_get_eabuf_size(struct mdt_thread_info *info,
866                                    struct mdt_object *o)
867 {
868         const struct lu_env *env = info->mti_env;
869         int rc, rc2;
870
871         rc = mo_xattr_get(env, mdt_object_child(o), &LU_BUF_NULL,
872                           XATTR_NAME_LOV);
873
874         if (rc == -ENODATA)
875                 rc = 0;
876
877         if (rc < 0)
878                 goto out;
879
880         /* Is it a directory? Let's check for the LMV as well */
881         if (S_ISDIR(lu_object_attr(&mdt_object_child(o)->mo_lu))) {
882                 rc2 = mo_xattr_get(env, mdt_object_child(o), &LU_BUF_NULL,
883                                    XATTR_NAME_LMV);
884
885                 if (rc2 == -ENODATA)
886                         rc2 = mo_xattr_get(env, mdt_object_child(o),
887                                            &LU_BUF_NULL,
888                                            XATTR_NAME_DEFAULT_LMV);
889
890                 if ((rc2 < 0 && rc2 != -ENODATA) || (rc2 > rc))
891                         rc = rc2;
892         }
893
894 out:
895         return rc;
896 }
897
898 int mdt_big_xattr_get(struct mdt_thread_info *info, struct mdt_object *o,
899                       const char *name)
900 {
901         const struct lu_env *env = info->mti_env;
902         int rc;
903         ENTRY;
904
905         LASSERT(info->mti_big_lmm_used == 0);
906         rc = mo_xattr_get(env, mdt_object_child(o), &LU_BUF_NULL, name);
907         if (rc < 0)
908                 RETURN(rc);
909
910         /* big_lmm may need to be grown */
911         if (info->mti_big_lmmsize < rc) {
912                 int size = size_roundup_power2(rc);
913
914                 if (info->mti_big_lmmsize > 0) {
915                         /* free old buffer */
916                         LASSERT(info->mti_big_lmm);
917                         OBD_FREE_LARGE(info->mti_big_lmm,
918                                        info->mti_big_lmmsize);
919                         info->mti_big_lmm = NULL;
920                         info->mti_big_lmmsize = 0;
921                 }
922
923                 OBD_ALLOC_LARGE(info->mti_big_lmm, size);
924                 if (info->mti_big_lmm == NULL)
925                         RETURN(-ENOMEM);
926                 info->mti_big_lmmsize = size;
927         }
928         LASSERT(info->mti_big_lmmsize >= rc);
929
930         info->mti_buf.lb_buf = info->mti_big_lmm;
931         info->mti_buf.lb_len = info->mti_big_lmmsize;
932         rc = mo_xattr_get(env, mdt_object_child(o), &info->mti_buf, name);
933
934         RETURN(rc);
935 }
936
/**
 * Read a striping EA (LOV, LMV or default LMV) of an object into \a ma
 *
 * The EA is first read into the buffer already set in \a ma (ma_lmm for LOV,
 * ma_lmv for LMV and default LMV). When that read fails with -ERANGE, the
 * LOV and LMV EAs are read again through mdt_big_xattr_get(), \a ma is
 * pointed at info->mti_big_lmm and info->mti_big_lmm_used is set to 1.
 * On success the EA size is stored in \a ma and the matching
 * MA_LOV / MA_LMV / MA_LMV_DEF bit is set in ma->ma_valid.
 *
 * \param[in]     info  thread environment
 * \param[in]     o     object to read the EA from
 * \param[in,out] ma    supplies the buffer, receives size and valid bit
 * \param[in]     name  XATTR_NAME_LOV, XATTR_NAME_LMV or
 *                      XATTR_NAME_DEFAULT_LMV
 *
 * \retval 0            EA read, or EA absent/empty (no valid bit set then)
 * \retval -EINVAL      \a name is none of the supported EAs
 * \retval -EIO         LOV EA has LOV_PATTERN_F_HOLE and the export lacks
 *                      OBD_CONNECT_LFSCK
 * \retval -ERANGE      default LMV EA does not fit in the supplied buffer
 * \retval < 0          other error from mo_xattr_get()/mdt_big_xattr_get()
 */
937 int __mdt_stripe_get(struct mdt_thread_info *info, struct mdt_object *o,
938                      struct md_attr *ma, const char *name)
939 {
940         struct md_object *next = mdt_object_child(o);
941         struct lu_buf    *buf = &info->mti_buf;
942         int rc;
943
        /* select the caller-supplied buffer; the EA must not have been
         * fetched into \a ma already */
944         if (strcmp(name, XATTR_NAME_LOV) == 0) {
945                 buf->lb_buf = ma->ma_lmm;
946                 buf->lb_len = ma->ma_lmm_size;
947                 LASSERT(!(ma->ma_valid & MA_LOV));
948         } else if (strcmp(name, XATTR_NAME_LMV) == 0) {
949                 buf->lb_buf = ma->ma_lmv;
950                 buf->lb_len = ma->ma_lmv_size;
951                 LASSERT(!(ma->ma_valid & MA_LMV));
952         } else if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
953                 buf->lb_buf = ma->ma_lmv;
954                 buf->lb_len = ma->ma_lmv_size;
955                 LASSERT(!(ma->ma_valid & MA_LMV_DEF));
956         } else {
957                 return -EINVAL;
958         }
959
960         LASSERT(buf->lb_buf);
961
962         rc = mo_xattr_get(info->mti_env, next, buf, name);
963         if (rc > 0) {
964
                /* also entered by goto from the -ERANGE branch below, with
                 * rc holding the size returned by mdt_big_xattr_get() */
965 got:
966                 if (strcmp(name, XATTR_NAME_LOV) == 0) {
967                         if (info->mti_big_lmm_used)
968                                 ma->ma_lmm = info->mti_big_lmm;
969
970                         /* NOT return LOV EA with hole to old client. */
971                         if (unlikely(le32_to_cpu(ma->ma_lmm->lmm_pattern) &
972                                      LOV_PATTERN_F_HOLE) &&
973                             !(exp_connect_flags(info->mti_exp) &
974                               OBD_CONNECT_LFSCK)) {
975                                 return -EIO;
976                         } else {
977                                 ma->ma_lmm_size = rc;
978                                 ma->ma_valid |= MA_LOV;
979                         }
980                 } else if (strcmp(name, XATTR_NAME_LMV) == 0) {
981                         if (info->mti_big_lmm_used)
982                                 ma->ma_lmv = info->mti_big_lmm;
983
984                         ma->ma_lmv_size = rc;
985                         ma->ma_valid |= MA_LMV;
986                 } else if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0) {
987                         ma->ma_lmv_size = rc;
988                         ma->ma_valid |= MA_LMV_DEF;
989                 }
990
991                 /* Raise mdt_max_mdsize to the largest EA size read so far */
992                 if (info->mti_mdt->mdt_max_mdsize < rc)
993                         info->mti_mdt->mdt_max_mdsize = rc;
994
995                 rc = 0;
996         } else if (rc == -ENODATA) {
997                 /* the requested EA does not exist, this is not an error */
998                 rc = 0;
999         } else if (rc == -ERANGE) {
1000                 /* Default LMV has fixed size, so it must be able to fit
1001                  * in the original buffer */
1002                 if (strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0)
1003                         return rc;
                /* supplied buffer too small: retry with the big_lmm buffer */
1004                 rc = mdt_big_xattr_get(info, o, name);
1005                 if (rc > 0) {
1006                         info->mti_big_lmm_used = 1;
1007                         goto got;
1008                 }
1009         }
1010
1011         return rc;
1012 }
1013
1014 int mdt_stripe_get(struct mdt_thread_info *info, struct mdt_object *o,
1015                    struct md_attr *ma, const char *name)
1016 {
1017         int rc;
1018
1019         if (!info->mti_big_lmm) {
1020                 OBD_ALLOC(info->mti_big_lmm, PAGE_SIZE);
1021                 if (!info->mti_big_lmm)
1022                         return -ENOMEM;
1023                 info->mti_big_lmmsize = PAGE_SIZE;
1024         }
1025
1026         if (strcmp(name, XATTR_NAME_LOV) == 0) {
1027                 ma->ma_lmm = info->mti_big_lmm;
1028                 ma->ma_lmm_size = info->mti_big_lmmsize;
1029                 ma->ma_valid &= ~MA_LOV;
1030         } else if (strcmp(name, XATTR_NAME_LMV) == 0) {
1031                 ma->ma_lmv = info->mti_big_lmm;
1032                 ma->ma_lmv_size = info->mti_big_lmmsize;
1033                 ma->ma_valid &= ~MA_LMV;
1034         } else {
1035                 LBUG();
1036         }
1037
1038         LASSERT(!info->mti_big_lmm_used);
1039         rc = __mdt_stripe_get(info, o, ma, name);
1040         /* since big_lmm is always used here, clear 'used' flag to avoid
1041          * assertion in mdt_big_xattr_get().
1042          */
1043         info->mti_big_lmm_used = 0;
1044
1045         return rc;
1046 }
1047
1048 int mdt_attr_get_pfid(struct mdt_thread_info *info, struct mdt_object *o,
1049                       struct lu_fid *pfid)
1050 {
1051         struct lu_buf           *buf = &info->mti_buf;
1052         struct link_ea_header   *leh;
1053         struct link_ea_entry    *lee;
1054         int                      rc;
1055         ENTRY;
1056
1057         buf->lb_buf = info->mti_big_lmm;
1058         buf->lb_len = info->mti_big_lmmsize;
1059         rc = mo_xattr_get(info->mti_env, mdt_object_child(o),
1060                           buf, XATTR_NAME_LINK);
1061         /* ignore errors, MA_PFID won't be set and it is
1062          * up to the caller to treat this as an error */
1063         if (rc == -ERANGE || buf->lb_len == 0) {
1064                 rc = mdt_big_xattr_get(info, o, XATTR_NAME_LINK);
1065                 buf->lb_buf = info->mti_big_lmm;
1066                 buf->lb_len = info->mti_big_lmmsize;
1067         }
1068
1069         if (rc < 0)
1070                 RETURN(rc);
1071         if (rc < sizeof(*leh)) {
1072                 CERROR("short LinkEA on "DFID": rc = %d\n",
1073                        PFID(mdt_object_fid(o)), rc);
1074                 RETURN(-ENODATA);
1075         }
1076
1077         leh = (struct link_ea_header *) buf->lb_buf;
1078         lee = (struct link_ea_entry *)(leh + 1);
1079         if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) {
1080                 leh->leh_magic = LINK_EA_MAGIC;
1081                 leh->leh_reccount = __swab32(leh->leh_reccount);
1082                 leh->leh_len = __swab64(leh->leh_len);
1083         }
1084         if (leh->leh_magic != LINK_EA_MAGIC)
1085                 RETURN(-EINVAL);
1086         if (leh->leh_reccount == 0)
1087                 RETURN(-ENODATA);
1088
1089         memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid));
1090         fid_be_to_cpu(pfid, pfid);
1091
1092         RETURN(0);
1093 }
1094
/**
 * Collect the attributes selected by ma->ma_need for an object
 *
 * ma->ma_valid is reset and then filled in step by step: inode attributes
 * (MA_INODE), parent FID (MA_PFID), LOV, LMV and default LMV EAs, SOM
 * (MA_SOM), HSM attributes (MA_HSM) and, when CONFIG_FS_POSIX_ACL is
 * defined, the default ACL (MA_ACL_DEF). ma->ma_need is overwritten while
 * the inode attributes are fetched and restored before returning.
 *
 * \param[in]     info  thread environment
 * \param[in]     o     object to get the attributes of
 * \param[in,out] ma    ma_need selects what to fetch, ma_valid reports what
 *                      was obtained
 *
 * \retval 0            on success
 * \retval -ENOENT      the object does not exist
 * \retval < 0          other error code
 */
1095 int mdt_attr_get_complex(struct mdt_thread_info *info,
1096                          struct mdt_object *o, struct md_attr *ma)
1097 {
1098         const struct lu_env *env = info->mti_env;
1099         struct md_object    *next = mdt_object_child(o);
1100         struct lu_buf       *buf = &info->mti_buf;
1101         int                  need = ma->ma_need;
1102         int                  rc = 0, rc2;
1103         u32                  mode;
1104         ENTRY;
1105
1106         ma->ma_valid = 0;
1107
1108         if (mdt_object_exists(o) == 0)
1109                 GOTO(out, rc = -ENOENT);
1110         mode = lu_object_attr(&next->mo_lu);
1111
1112         if (need & MA_INODE) {
                /* ask mo_attr_get() for the inode attributes only; the
                 * caller's mask is put back at the out label */
1113                 ma->ma_need = MA_INODE;
1114                 rc = mo_attr_get(env, next, ma);
1115                 if (rc)
1116                         GOTO(out, rc);
1117
                /* best effort: the result is ignored here, SOM is fetched
                 * again below if MA_SOM is needed and still not valid */
1118                 if (S_ISREG(mode))
1119                         (void) mdt_get_som(info, o, ma);
1120                 ma->ma_valid |= MA_INODE;
1121         }
1122
1123         if (need & MA_PFID) {
1124                 rc = mdt_attr_get_pfid(info, o, &ma->ma_pfid);
1125                 if (rc == 0)
1126                         ma->ma_valid |= MA_PFID;
1127                 /* ignore this error, parent fid is not mandatory */
1128                 rc = 0;
1129         }
1130
1131         if (need & MA_LOV && (S_ISREG(mode) || S_ISDIR(mode))) {
1132                 rc = __mdt_stripe_get(info, o, ma, XATTR_NAME_LOV);
1133                 if (rc)
1134                         GOTO(out, rc);
1135         }
1136
1137         if (need & MA_LMV && S_ISDIR(mode)) {
1138                 rc = __mdt_stripe_get(info, o, ma, XATTR_NAME_LMV);
1139                 if (rc != 0)
1140                         GOTO(out, rc);
1141         }
1142
1143         if (need & MA_LMV_DEF && S_ISDIR(mode)) {
1144                 rc = __mdt_stripe_get(info, o, ma, XATTR_NAME_DEFAULT_LMV);
1145                 if (rc != 0)
1146                         GOTO(out, rc);
1147         }
1148
1149         /*
1150          * In the handle of MA_INODE, we may already get the SOM attr.
1151          */
1152         if (need & MA_SOM && S_ISREG(mode) && !(ma->ma_valid & MA_SOM)) {
1153                 rc = mdt_get_som(info, o, ma);
1154                 if (rc != 0)
1155                         GOTO(out, rc);
1156         }
1157
1158         if (need & MA_HSM && S_ISREG(mode)) {
1159                 buf->lb_buf = info->mti_xattr_buf;
1160                 buf->lb_len = sizeof(info->mti_xattr_buf);
1161                 CLASSERT(sizeof(struct hsm_attrs) <=
1162                          sizeof(info->mti_xattr_buf));
                /* the mo_xattr_get() result (size or error) is handed to
                 * lustre_buf2hsm() as is; only its return value is checked */
1163                 rc2 = mo_xattr_get(info->mti_env, next, buf, XATTR_NAME_HSM);
1164                 rc2 = lustre_buf2hsm(info->mti_xattr_buf, rc2, &ma->ma_hsm);
1165                 if (rc2 == 0)
1166                         ma->ma_valid |= MA_HSM;
1167                 else if (rc2 < 0 && rc2 != -ENODATA)
1168                         GOTO(out, rc = rc2);
1169         }
1170
1171 #ifdef CONFIG_FS_POSIX_ACL
1172         if (need & MA_ACL_DEF && S_ISDIR(mode)) {
1173                 buf->lb_buf = ma->ma_acl;
1174                 buf->lb_len = ma->ma_acl_size;
1175                 rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT);
1176                 if (rc2 > 0) {
1177                         ma->ma_acl_size = rc2;
1178                         ma->ma_valid |= MA_ACL_DEF;
1179                 } else if (rc2 == -ENODATA) {
1180                         /* no ACLs */
1181                         ma->ma_acl_size = 0;
1182                 } else
1183                         GOTO(out, rc = rc2);
1184         }
1185 #endif
1186 out:
        /* give the caller back the mask it passed in */
1187         ma->ma_need = need;
1188         CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = %#llx ma_lmm=%p\n",
1189                rc, ma->ma_valid, ma->ma_lmm);
1190         RETURN(rc);
1191 }
1192
/**
 * Fill the getattr reply for an object
 *
 * Packs the inode attributes of \a o into the RMF_MDT_BODY reply buffer and,
 * depending on the flags in the request body and on the object type, also
 * the LOV/LMV/default LMV EA, the symlink target, the maximum MD size and
 * the ACL. For an object located on a remote node only the FID is returned,
 * flagged with OBD_MD_MDS.
 *
 * \param[in] info      thread environment; info->mti_body is the request
 *                      body
 * \param[in] o         object to get the attributes of
 * \param[in] ma_need   MA_* bits requested in addition to those derived from
 *                      the request; MA_LOV and MA_LMV are dropped when the
 *                      request carries no EA buffer size
 *
 * \retval 0    on success
 * \retval < 0  error code
 */
1193 static int mdt_getattr_internal(struct mdt_thread_info *info,
1194                                 struct mdt_object *o, int ma_need)
1195 {
1196         struct md_object        *next = mdt_object_child(o);
1197         const struct mdt_body   *reqbody = info->mti_body;
1198         struct ptlrpc_request   *req = mdt_info_req(info);
1199         struct md_attr          *ma = &info->mti_attr;
1200         struct lu_attr          *la = &ma->ma_attr;
1201         struct req_capsule      *pill = info->mti_pill;
1202         const struct lu_env     *env = info->mti_env;
1203         struct mdt_body         *repbody;
1204         struct lu_buf           *buffer = &info->mti_buf;
1205         struct obd_export       *exp = info->mti_exp;
1206         int                      rc;
1207         ENTRY;
1208
1209         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK))
1210                 RETURN(err_serious(-ENOMEM));
1211
1212         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
1213
1214         ma->ma_valid = 0;
1215
1216         if (mdt_object_remote(o)) {
1217                 /* This object is located on remote node.*/
1218                 /* Return -ENOTSUPP for old client */
1219                 if (!mdt_is_dne_client(req->rq_export))
1220                         GOTO(out, rc = -ENOTSUPP);
1221
1222                 repbody->mbo_fid1 = *mdt_object_fid(o);
1223                 repbody->mbo_valid = OBD_MD_FLID | OBD_MD_MDS;
1224                 GOTO(out, rc = 0);
1225         }
1226
        /* use the RMF_MDT_MD reply buffer for the EA only if the client
         * announced a non-zero EA size */
1227         if (reqbody->mbo_eadatasize > 0) {
1228                 buffer->lb_buf = req_capsule_server_get(pill, &RMF_MDT_MD);
1229                 if (buffer->lb_buf == NULL)
1230                         GOTO(out, rc = -EPROTO);
1231                 buffer->lb_len = req_capsule_get_size(pill, &RMF_MDT_MD,
1232                                                       RCL_SERVER);
1233         } else {
1234                 buffer->lb_buf = NULL;
1235                 buffer->lb_len = 0;
1236                 ma_need &= ~(MA_LOV | MA_LMV);
1237                 CDEBUG(D_INFO, "%s: RPC from %s: does not need LOVEA.\n",
1238                        mdt_obd_name(info->mti_mdt),
1239                        req->rq_export->exp_client_uuid.uuid);
1240         }
1241
1242         /* If it is dir object and client require MEA, then we got MEA */
1243         if (S_ISDIR(lu_object_attr(&next->mo_lu)) &&
1244             (reqbody->mbo_valid & (OBD_MD_MEA | OBD_MD_DEFAULT_MEA))) {
1245                 /* Assumption: MDT_MD size is enough for lmv size. */
1246                 ma->ma_lmv = buffer->lb_buf;
1247                 ma->ma_lmv_size = buffer->lb_len;
1248                 ma->ma_need = MA_INODE;
1249                 if (ma->ma_lmv_size > 0) {
                        /* OBD_MD_MEA takes precedence when both are set */
1250                         if (reqbody->mbo_valid & OBD_MD_MEA)
1251                                 ma->ma_need |= MA_LMV;
1252                         else if (reqbody->mbo_valid & OBD_MD_DEFAULT_MEA)
1253                                 ma->ma_need |= MA_LMV_DEF;
1254                 }
1255         } else {
1256                 ma->ma_lmm = buffer->lb_buf;
1257                 ma->ma_lmm_size = buffer->lb_len;
1258                 ma->ma_need = MA_INODE | MA_HSM;
1259                 if (ma->ma_lmm_size > 0)
1260                         ma->ma_need |= MA_LOV;
1261         }
1262
1263         if (S_ISDIR(lu_object_attr(&next->mo_lu)) &&
1264             reqbody->mbo_valid & OBD_MD_FLDIREA  &&
1265             lustre_msg_get_opc(req->rq_reqmsg) == MDS_GETATTR) {
1266                 /* get default stripe info for this dir. */
1267                 ma->ma_need |= MA_LOV_DEF;
1268         }
1269         ma->ma_need |= ma_need;
1270
1271         rc = mdt_attr_get_complex(info, o, ma);
1272         if (unlikely(rc)) {
1273                 CDEBUG(rc == -ENOENT ? D_OTHER : D_ERROR,
1274                        "%s: getattr error for "DFID": rc = %d\n",
1275                        mdt_obd_name(info->mti_mdt),
1276                        PFID(mdt_object_fid(o)), rc);
1277                 RETURN(rc);
1278         }
1279
1280         /* if file is released, check if a restore is running */
1281         if (ma->ma_valid & MA_HSM) {
1282                 repbody->mbo_valid |= OBD_MD_TSTATE;
1283                 if ((ma->ma_hsm.mh_flags & HS_RELEASED) &&
1284                     mdt_hsm_restore_is_running(info, mdt_object_fid(o)))
1285                         repbody->mbo_t_state = MS_RESTORE;
1286         }
1287
1288         if (likely(ma->ma_valid & MA_INODE))
1289                 mdt_pack_attr2body(info, repbody, la, mdt_object_fid(o));
1290         else
1291                 RETURN(-EFAULT);
1292
        /* the EA flags are tested on the request body; the matching flags
         * are then set in the reply body for each EA actually obtained */
1293         if (mdt_body_has_lov(la, reqbody)) {
1294                 if (ma->ma_valid & MA_LOV) {
1295                         LASSERT(ma->ma_lmm_size);
1296                         repbody->mbo_eadatasize = ma->ma_lmm_size;
1297                         if (S_ISDIR(la->la_mode))
1298                                 repbody->mbo_valid |= OBD_MD_FLDIREA;
1299                         else
1300                                 repbody->mbo_valid |= OBD_MD_FLEASIZE;
1301                         mdt_dump_lmm(D_INFO, ma->ma_lmm, repbody->mbo_valid);
1302                 }
1303                 if (ma->ma_valid & MA_LMV) {
1304                         /* Return -ENOTSUPP for old client */
1305                         if (!mdt_is_striped_client(req->rq_export))
1306                                 RETURN(-ENOTSUPP);
1307
1308                         LASSERT(S_ISDIR(la->la_mode));
1309                         mdt_dump_lmv(D_INFO, ma->ma_lmv);
1310                         repbody->mbo_eadatasize = ma->ma_lmv_size;
1311                         repbody->mbo_valid |= (OBD_MD_FLDIREA|OBD_MD_MEA);
1312                 }
1313                 if (ma->ma_valid & MA_LMV_DEF) {
1314                         /* Return -ENOTSUPP for old client */
1315                         if (!mdt_is_striped_client(req->rq_export))
1316                                 RETURN(-ENOTSUPP);
1317                         LASSERT(S_ISDIR(la->la_mode));
1318                         mdt_dump_lmv(D_INFO, ma->ma_lmv);
1319                         repbody->mbo_eadatasize = ma->ma_lmv_size;
1320                         repbody->mbo_valid |= (OBD_MD_FLDIREA |
1321                                                OBD_MD_DEFAULT_MEA);
1322                 }
1323         } else if (S_ISLNK(la->la_mode) &&
1324                    reqbody->mbo_valid & OBD_MD_LINKNAME) {
1325                 buffer->lb_buf = ma->ma_lmm;
1326                 /* eadatasize from client includes NULL-terminator, so
1327                  * there is no need to read it */
                /* NOTE(review): with mbo_eadatasize == 0 the buffer set up
                 * above is NULL and this length becomes -1; presumably
                 * clients always send a non-zero size together with
                 * OBD_MD_LINKNAME - confirm */
1328                 buffer->lb_len = reqbody->mbo_eadatasize - 1;
1329                 rc = mo_readlink(env, next, buffer);
1330                 if (unlikely(rc <= 0)) {
1331                         CERROR("%s: readlink failed for "DFID": rc = %d\n",
1332                                mdt_obd_name(info->mti_mdt),
1333                                PFID(mdt_object_fid(o)), rc);
1334                         rc = -EFAULT;
1335                 } else {
                        /* NOTE(review): print_limit is taken from rc before
                         * the fail-injection below may shrink rc by 2 */
1336                         int print_limit = min_t(int, PAGE_SIZE - 128, rc);
1337
1338                         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_READLINK_EPROTO))
1339                                 rc -= 2;
1340                         repbody->mbo_valid |= OBD_MD_LINKNAME;
1341                         /* we need to report back size with NULL-terminator
1342                          * because client expects that */
1343                         repbody->mbo_eadatasize = rc + 1;
1344                         if (repbody->mbo_eadatasize != reqbody->mbo_eadatasize)
1345                                 CDEBUG(D_INODE, "%s: Read shorter symlink %d "
1346                                        "on "DFID ", expected %d\n",
1347                                        mdt_obd_name(info->mti_mdt),
1348                                        rc, PFID(mdt_object_fid(o)),
1349                                        reqbody->mbo_eadatasize - 1);
1350                         /* NULL terminate */
1351                         ((char *)ma->ma_lmm)[rc] = 0;
1352
1353                         /* If the total CDEBUG() size is larger than a page, it
1354                          * will print a warning to the console, avoid this by
1355                          * printing just the last part of the symlink. */
1356                         CDEBUG(D_INODE, "symlink dest %s%.*s, len = %d\n",
1357                                print_limit < rc ? "..." : "", print_limit,
1358                                (char *)ma->ma_lmm + rc - print_limit, rc);
1359                         rc = 0;
1360                 }
1361         }
1362
1363         if (reqbody->mbo_valid & OBD_MD_FLMODEASIZE) {
1364                 repbody->mbo_max_mdsize = info->mti_mdt->mdt_max_mdsize;
1365                 repbody->mbo_valid |= OBD_MD_FLMODEASIZE;
1366                 CDEBUG(D_INODE, "changing the max MD size to %u\n",
1367                        repbody->mbo_max_mdsize);
1368         }
1369
1370 #ifdef CONFIG_FS_POSIX_ACL
1371         if ((exp_connect_flags(req->rq_export) & OBD_CONNECT_ACL) &&
1372                  (reqbody->mbo_valid & OBD_MD_FLACL)) {
1373                 struct lu_nodemap *nodemap = nodemap_get_from_exp(exp);
1374                 if (IS_ERR(nodemap))
1375                         RETURN(PTR_ERR(nodemap));
1376
                /* NOTE(review): this assignment replaces any -EFAULT left
                 * in rc by a failed readlink above - confirm intended */
1377                 rc = mdt_pack_acl2body(info, repbody, o, nodemap);
1378                 nodemap_putref(nodemap);
1379         }
1380 #endif
1381
1382 out:
        /* only successful getattrs are counted */
1383         if (rc == 0)
1384                 mdt_counter_incr(req, LPROC_MDT_GETATTR);
1385
1386         RETURN(rc);
1387 }
1388
/**
 * Request handler for getattr
 *
 * Requests flagged with OBD_MD_FLDATAVERSION are passed straight to
 * mdt_data_version_get(). For all others the RMF_MDT_MD and RMF_ACL reply
 * buffers are sized, the reply is packed, the user credentials are checked
 * and the reply is filled in by mdt_getattr_internal(). The reply is then
 * adjusted by mdt_client_compatibility() and mdt_fix_reply().
 *
 * \param[in] tsi       target session info of the request
 *
 * \retval 0    on success
 * \retval < 0  error code
 */
1389 static int mdt_getattr(struct tgt_session_info *tsi)
1390 {
1391         struct mdt_thread_info  *info = tsi2mdt_info(tsi);
1392         struct mdt_object       *obj = info->mti_object;
1393         struct req_capsule      *pill = info->mti_pill;
1394         struct mdt_body         *reqbody;
1395         struct mdt_body         *repbody;
1396         int rc, rc2;
1397         ENTRY;
1398
1399         if (unlikely(info->mti_object == NULL))
1400                 RETURN(-EPROTO);
1401
1402         reqbody = req_capsule_client_get(pill, &RMF_MDT_BODY);
1403         LASSERT(reqbody);
1404         LASSERT(lu_object_assert_exists(&obj->mot_obj));
1405
1406         /* Special case for Data-on-MDT files to get data version */
1407         if (unlikely(reqbody->mbo_valid & OBD_MD_FLDATAVERSION)) {
1408                 rc = mdt_data_version_get(tsi);
1409                 GOTO(out, rc);
1410         }
1411
1412         /* Unlike intent case where we need to pre-fill out buffers early on
1413          * in intent policy for ldlm reasons, here we can have a much better
1414          * guess at EA size by just reading it from disk.
1415          * Exceptions are readdir and (missing) directory striping */
        /* from here on rc holds the RMF_MDT_MD reply buffer size to set */
1416         /* Readlink */
1417         if (reqbody->mbo_valid & OBD_MD_LINKNAME) {
1418                 /* No easy way to know how long is the symlink, but it cannot
1419                  * be more than PATH_MAX, so we allocate +1 */
1420                 rc = PATH_MAX + 1;
1421         /* A special case for fs ROOT: getattr there might fetch
1422          * default EA for entire fs, not just for this dir!
1423          */
1424         } else if (lu_fid_eq(mdt_object_fid(obj),
1425                              &info->mti_mdt->mdt_md_root_fid) &&
1426                    (reqbody->mbo_valid & OBD_MD_FLDIREA) &&
1427                    (lustre_msg_get_opc(mdt_info_req(info)->rq_reqmsg) ==
1428                                                                  MDS_GETATTR)) {
1429                 /* Should the default striping be bigger, mdt_fix_reply
1430                  * will reallocate */
1431                 rc = DEF_REP_MD_SIZE;
1432         } else {
1433                 /* Read the actual EA size from disk */
1434                 rc = mdt_attr_get_eabuf_size(info, obj);
1435         }
1436
1437         if (rc < 0)
1438                 GOTO(out, rc = err_serious(rc));
1439
1440         req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, rc);
1441
1442         /* Set ACL reply buffer size as LUSTRE_POSIX_ACL_MAX_SIZE_OLD
1443          * by default. If the target object has more ACL entries, then
1444          * enlarge the buffer when necessary. */
1445         req_capsule_set_size(pill, &RMF_ACL, RCL_SERVER,
1446                              LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
1447
1448         rc = req_capsule_server_pack(pill);
1449         if (unlikely(rc != 0))
1450                 GOTO(out, rc = err_serious(rc));
1451
1452         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
1453         LASSERT(repbody != NULL);
1454         repbody->mbo_eadatasize = 0;
1455         repbody->mbo_aclsize = 0;
1456
1457         rc = mdt_check_ucred(info);
1458         if (unlikely(rc))
1459                 GOTO(out_shrink, rc);
1460
1461         info->mti_cross_ref = !!(reqbody->mbo_valid & OBD_MD_FLCROSSREF);
1462
1463         rc = mdt_getattr_internal(info, obj, 0);
1464         EXIT;
        /* reached on success too (fall through), not only via GOTO */
1465 out_shrink:
1466         mdt_client_compatibility(info);
1467         rc2 = mdt_fix_reply(info);
        /* an earlier error takes precedence over the mdt_fix_reply() one */
1468         if (rc == 0)
1469                 rc = rc2;
1470 out:
1471         mdt_thread_info_fini(info);
1472         return rc;
1473 }
1474
1475 /**
1476  * Handler of layout intent RPC requiring the layout modification
1477  *
1478  * \param[in]  info     thread environment
1479  * \param[in]  obj      object
1480  * \param[out] lhc      object ldlm lock handle
1481  * \param[in]  layout   layout change descriptor
1482  *
1483  * \retval 0    on success
1484  * \retval < 0  error code
1485  */
1486 int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
1487                       struct mdt_lock_handle *lhc,
1488                       struct md_layout_change *layout)
1489 {
1490         int rc;
1491
1492         ENTRY;
1493
1494         if (!mdt_object_exists(obj))
1495                 RETURN(-ENOENT);
1496
1497         if (!S_ISREG(lu_object_attr(&obj->mot_obj)))
1498                 RETURN(-EINVAL);
1499
1500         rc = mo_permission(info->mti_env, NULL, mdt_object_child(obj), NULL,
1501                            MAY_WRITE);
1502         if (rc)
1503                 RETURN(rc);
1504
1505         rc = mdt_check_resent_lock(info, obj, lhc);
1506         if (rc < 0)
1507                 RETURN(rc);
1508
1509         if (rc > 0) {
1510                 /* not resent */
1511                 mdt_lock_handle_init(lhc);
1512                 mdt_lock_reg_init(lhc, LCK_EX);
1513                 rc = mdt_reint_object_lock(info, obj, lhc, MDS_INODELOCK_LAYOUT,
1514                                            false);
1515                 if (rc)
1516                         RETURN(rc);
1517         }
1518
1519         mutex_lock(&obj->mot_som_mutex);
1520         rc = mo_layout_change(info->mti_env, mdt_object_child(obj), layout);
1521         mutex_unlock(&obj->mot_som_mutex);
1522
1523         if (rc)
1524                 mdt_object_unlock(info, obj, lhc, 1);
1525
1526         RETURN(rc);
1527 }
1528
1529 /**
1530  * Exchange MOF_LOV_CREATED flags between two objects after a
1531  * layout swap. No assumption is made on whether o1 or o2 have
1532  * created objects or not.
1533  *
1534  * \param[in,out] o1    First swap layout object
1535  * \param[in,out] o2    Second swap layout object
1536  */
1537 static void mdt_swap_lov_flag(struct mdt_object *o1, struct mdt_object *o2)
1538 {
1539         unsigned int o1_lov_created = o1->mot_lov_created;
1540
1541         mutex_lock(&o1->mot_lov_mutex);
1542         mutex_lock(&o2->mot_lov_mutex);
1543
1544         o1->mot_lov_created = o2->mot_lov_created;
1545         o2->mot_lov_created = o1_lov_created;
1546
1547         mutex_unlock(&o2->mot_lov_mutex);
1548         mutex_unlock(&o1->mot_lov_mutex);
1549 }
1550
1551 static int mdt_swap_layouts(struct tgt_session_info *tsi)
1552 {
1553         struct mdt_thread_info  *info;
1554         struct ptlrpc_request   *req = tgt_ses_req(tsi);
1555         struct obd_export       *exp = req->rq_export;
1556         struct mdt_object       *o1, *o2, *o;
1557         struct mdt_lock_handle  *lh1, *lh2;
1558         struct mdc_swap_layouts *msl;
1559         int                      rc;
1560         ENTRY;
1561
1562         /* client does not support layout lock, so layout swaping
1563          * is disabled.
1564          * FIXME: there is a problem for old clients which don't support
1565          * layout lock yet. If those clients have already opened the file
1566          * they won't be notified at all so that old layout may still be
1567          * used to do IO. This can be fixed after file release is landed by
1568          * doing exclusive open and taking full EX ibits lock. - Jinshan */
1569         if (!exp_connect_layout(exp))
1570                 RETURN(-EOPNOTSUPP);
1571
1572         info = tsi2mdt_info(tsi);
1573         if (unlikely(info->mti_object == NULL))
1574                 RETURN(-EPROTO);
1575
1576         if (info->mti_dlm_req != NULL)
1577                 ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
1578
1579         o1 = info->mti_object;
1580         o = o2 = mdt_object_find(info->mti_env, info->mti_mdt,
1581                                 &info->mti_body->mbo_fid2);
1582         if (IS_ERR(o))
1583                 GOTO(out, rc = PTR_ERR(o));
1584
1585         if (mdt_object_remote(o) || !mdt_object_exists(o)) /* remote object */
1586                 GOTO(put, rc = -ENOENT);
1587
1588         rc = lu_fid_cmp(&info->mti_body->mbo_fid1, &info->mti_body->mbo_fid2);
1589         if (unlikely(rc == 0)) /* same file, you kidding me? no-op. */
1590                 GOTO(put, rc);
1591
1592         if (rc < 0)
1593                 swap(o1, o2);
1594
1595         /* permission check. Make sure the calling process having permission
1596          * to write both files. */
1597         rc = mo_permission(info->mti_env, NULL, mdt_object_child(o1), NULL,
1598                            MAY_WRITE);
1599         if (rc < 0)
1600                 GOTO(put, rc);
1601
1602         rc = mo_permission(info->mti_env, NULL, mdt_object_child(o2), NULL,
1603                            MAY_WRITE);
1604         if (rc < 0)
1605                 GOTO(put, rc);
1606
1607         msl = req_capsule_client_get(info->mti_pill, &RMF_SWAP_LAYOUTS);
1608         if (msl == NULL)
1609                 GOTO(put, rc = -EPROTO);
1610
1611         lh1 = &info->mti_lh[MDT_LH_NEW];
1612         mdt_lock_reg_init(lh1, LCK_EX);
1613         lh2 = &info->mti_lh[MDT_LH_OLD];
1614         mdt_lock_reg_init(lh2, LCK_EX);
1615
1616         rc = mdt_object_lock(info, o1, lh1, MDS_INODELOCK_LAYOUT |
1617                              MDS_INODELOCK_XATTR);
1618         if (rc < 0)
1619                 GOTO(put, rc);
1620
1621         rc = mdt_object_lock(info, o2, lh2, MDS_INODELOCK_LAYOUT |
1622                              MDS_INODELOCK_XATTR);
1623         if (rc < 0)
1624                 GOTO(unlock1, rc);
1625
1626         rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
1627                              mdt_object_child(o2), msl->msl_flags);
1628         if (rc < 0)
1629                 GOTO(unlock2, rc);
1630
1631         mdt_swap_lov_flag(o1, o2);
1632
1633 unlock2:
1634         mdt_object_unlock(info, o2, lh2, rc);
1635 unlock1:
1636         mdt_object_unlock(info, o1, lh1, rc);
1637 put:
1638         mdt_object_put(info->mti_env, o);
1639 out:
1640         mdt_thread_info_fini(info);
1641         RETURN(rc);
1642 }
1643
1644 static int mdt_raw_lookup(struct mdt_thread_info *info,
1645                           struct mdt_object *parent,
1646                           const struct lu_name *lname,
1647                           struct ldlm_reply *ldlm_rep)
1648 {
1649         struct lu_fid   *child_fid = &info->mti_tmp_fid1;
1650         int              rc;
1651         ENTRY;
1652
1653         LASSERT(!info->mti_cross_ref);
1654
1655         /* Only got the fid of this obj by name */
1656         fid_zero(child_fid);
1657         rc = mdo_lookup(info->mti_env, mdt_object_child(info->mti_object),
1658                         lname, child_fid, &info->mti_spec);
1659         if (rc == 0) {
1660                 struct mdt_body *repbody;
1661
1662                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1663                 repbody->mbo_fid1 = *child_fid;
1664                 repbody->mbo_valid = OBD_MD_FLID;
1665                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_POS);
1666         } else if (rc == -ENOENT) {
1667                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
1668         }
1669
1670         RETURN(rc);
1671 }
1672
1673 /*
1674  * UPDATE lock should be taken against parent, and be released before exit;
1675  * child_bits lock should be taken against child, and be returned back:
1676  *            (1)normal request should release the child lock;
1677  *            (2)intent request will grant the lock to client.
1678  */
/**
 * Locate the child of info->mti_object, lock it and pack its attributes
 * into the reply.
 *
 * Three request shapes are handled:
 *  - cross-ref (info->mti_cross_ref set): info->mti_object itself is the
 *    object to stat; LOOKUP and LAYOUT bits are removed from \a child_bits
 *    and PERM is added before locking;
 *  - a valid name in RMF_NAME: the name is looked up in the parent, which is
 *    locked with a PR PDO lock on UPDATE when it is a directory.  If the
 *    request asks for OBD_MD_FLID only, mdt_raw_lookup() answers it and no
 *    lock is taken at all;
 *  - no valid name: the child FID is taken from mbo_fid2 of the request body.
 *
 * On success the child lock is left in \a lhc for the caller.  The parent
 * lock, when taken, is always released before returning.
 *
 * \param[in]     info        thread environment; info->mti_object is the parent
 * \param[in,out] lhc         child lock handle; asserted to be in use on entry
 *                            only for MSG_RESENT requests
 * \param[in]     child_bits  inodebits to lock on the child (adjusted
 *                            internally, see the code)
 * \param[in,out] ldlm_rep    reply receiving the lookup dispositions; the
 *                            LAYOUT/DOM bits are only tried when it is
 *                            non-NULL
 *
 * \retval 0    on success
 * \retval < 0  error code (-ENOENT when there is no parent or the child does
 *              not exist, -ESTALE when the parent does not exist, -EPROTO for
 *              a remote parent/child, or an error from a callee)
 */
1679 static int mdt_getattr_name_lock(struct mdt_thread_info *info,
1680                                  struct mdt_lock_handle *lhc,
1681                                  __u64 child_bits,
1682                                  struct ldlm_reply *ldlm_rep)
1683 {
1684         struct ptlrpc_request *req = mdt_info_req(info);
1685         struct mdt_body *reqbody = NULL;
1686         struct mdt_object *parent = info->mti_object;
1687         struct mdt_object *child = NULL;
1688         struct lu_fid *child_fid = &info->mti_tmp_fid1;
1689         struct lu_name *lname = NULL;
1690         struct mdt_lock_handle *lhp = NULL;
1691         struct ldlm_lock *lock;
1692         __u64 try_bits = 0;
1693         bool is_resent;
1694         int ma_need = 0;
1695         int rc;
1696
1697         ENTRY;
1698
             /* a child lock handle that is already in use is only legal when
              * the request message is flagged MSG_RESENT */
1699         is_resent = lustre_handle_is_used(&lhc->mlh_reg_lh);
1700         LASSERT(ergo(is_resent,
1701                      lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT));
1702
1703         if (parent == NULL)
1704                 RETURN(-ENOENT);
1705
1706         if (info->mti_cross_ref) {
1707                 /* Only getattr on the child. Parent is on another node. */
1708                 mdt_set_disposition(info, ldlm_rep,
1709                                     DISP_LOOKUP_EXECD | DISP_LOOKUP_POS);
1710                 child = parent;
1711                 CDEBUG(D_INODE, "partial getattr_name child_fid = "DFID", "
1712                        "ldlm_rep = %p\n",
1713                        PFID(mdt_object_fid(child)), ldlm_rep);
1714
1715                 rc = mdt_check_resent_lock(info, child, lhc);
1716                 if (rc < 0) {
1717                         RETURN(rc);
1718                 } else if (rc > 0) {
1719                         mdt_lock_handle_init(lhc);
1720                         mdt_lock_reg_init(lhc, LCK_PR);
1721
1722                         /*
1723                          * Object's name entry is on another MDS, it will
1724                          * request PERM lock only because LOOKUP lock is owned
1725                          * by the MDS where name entry resides.
1726                          *
1727                          * TODO: it should try layout lock too. - Jinshan
1728                          */
1729                         child_bits &= ~(MDS_INODELOCK_LOOKUP |
1730                                         MDS_INODELOCK_LAYOUT);
1731                         child_bits |= MDS_INODELOCK_PERM;
1732
1733                         rc = mdt_object_lock(info, child, lhc, child_bits);
1734                         if (rc < 0)
1735                                 RETURN(rc);
1736                 }
1737
1738                 /* Finally, we can get attr for child. */
1739                 if (!mdt_object_exists(child)) {
1740                         LU_OBJECT_DEBUG(D_INFO, info->mti_env,
1741                                         &child->mot_obj,
1742                                         "remote object doesn't exist.");
1743                         mdt_object_unlock(info, child, lhc, 1);
1744                         RETURN(-ENOENT);
1745                 }
1746
1747                 rc = mdt_getattr_internal(info, child, 0);
1748                 if (unlikely(rc != 0))
1749                         mdt_object_unlock(info, child, lhc, 1);
1750
                     /* NOTE(review): this runs even when mdt_getattr_internal()
                      * failed above - confirm that is intended */
1751                 mdt_pack_secctx_in_reply(info, child);
1752
1753                 RETURN(rc);
1754         }
1755
1756         lname = &info->mti_name;
1757         mdt_name_unpack(info->mti_pill, &RMF_NAME, lname, MNF_FIX_ANON);
1758
1759         if (lu_name_is_valid(lname)) {
1760                 if (mdt_object_remote(parent)) {
1761                         CERROR("%s: parent "DFID" is on remote target\n",
1762                                mdt_obd_name(info->mti_mdt),
1763                                PFID(mdt_object_fid(parent)));
1764                         RETURN(-EPROTO);
1765                 }
1766
1767                 CDEBUG(D_INODE, "getattr with lock for "DFID"/"DNAME", "
1768                        "ldlm_rep = %p\n", PFID(mdt_object_fid(parent)),
1769                        PNAME(lname), ldlm_rep);
1770         } else {
                     /* no usable name: the child is identified by mbo_fid2 */
1771                 reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
1772                 if (unlikely(reqbody == NULL))
1773                         RETURN(err_serious(-EPROTO));
1774
1775                 *child_fid = reqbody->mbo_fid2;
1776                 if (unlikely(!fid_is_sane(child_fid)))
1777                         RETURN(err_serious(-EINVAL));
1778
                     /* either way "child" holds its own reference from here
                      * on, which out_child puts */
1779                 if (lu_fid_eq(mdt_object_fid(parent), child_fid)) {
1780                         mdt_object_get(info->mti_env, parent);
1781                         child = parent;
1782                 } else {
1783                         child = mdt_object_find(info->mti_env, info->mti_mdt,
1784                                                 child_fid);
1785                         if (IS_ERR(child))
1786                                 RETURN(PTR_ERR(child));
1787                 }
1788
1789                 if (mdt_object_remote(child)) {
1790                         CERROR("%s: child "DFID" is on remote target\n",
1791                                mdt_obd_name(info->mti_mdt),
1792                                PFID(mdt_object_fid(child)));
1793                         GOTO(out_child, rc = -EPROTO);
1794                 }
1795
1796                 /* don't fetch LOOKUP lock if it's remote object */
1797                 rc = mdt_is_remote_object(info, parent, child);
1798                 if (rc < 0)
1799                         GOTO(out_child, rc);
1800                 if (rc)
1801                         child_bits &= ~MDS_INODELOCK_LOOKUP;
1802
1803                 CDEBUG(D_INODE, "getattr with lock for "DFID"/"DFID", "
1804                        "ldlm_rep = %p\n",
1805                        PFID(mdt_object_fid(parent)),
1806                        PFID(&reqbody->mbo_fid2), ldlm_rep);
1807         }
1808
1809         mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_EXECD);
1810
1811         if (unlikely(!mdt_object_exists(parent)) && lu_name_is_valid(lname)) {
1812                 LU_OBJECT_DEBUG(D_INODE, info->mti_env,
1813                                 &parent->mot_obj,
1814                                 "Parent doesn't exist!");
1815                 GOTO(out_child, rc = -ESTALE);
1816         }
1817
1818         if (lu_name_is_valid(lname)) {
1819                 /* Always allow to lookup ".." */
1820                 if (unlikely(lname->ln_namelen == 2 &&
1821                              lname->ln_name[0] == '.' &&
1822                              lname->ln_name[1] == '.'))
1823                         info->mti_spec.sp_permitted = 1;
1824
                     /* FID-only request: no lock has been taken and child is
                      * still NULL on this path, so a direct return leaves
                      * nothing to clean up */
1825                 if (info->mti_body->mbo_valid == OBD_MD_FLID) {
1826                         rc = mdt_raw_lookup(info, parent, lname, ldlm_rep);
1827
1828                         RETURN(rc);
1829                 }
1830
1831                 /* step 1: lock parent only if parent is a directory */
1832                 if (S_ISDIR(lu_object_attr(&parent->mot_obj))) {
1833                         lhp = &info->mti_lh[MDT_LH_PARENT];
1834                         mdt_lock_pdo_init(lhp, LCK_PR, lname);
1835                         rc = mdt_object_lock(info, parent, lhp,
1836                                              MDS_INODELOCK_UPDATE);
1837                         if (unlikely(rc != 0))
1838                                 RETURN(rc);
1839                 }
1840
1841                 /* step 2: lookup child's fid by name */
1842                 fid_zero(child_fid);
1843                 rc = mdo_lookup(info->mti_env, mdt_object_child(parent), lname,
1844                                 child_fid, &info->mti_spec);
1845                 if (rc == -ENOENT)
1846                         mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
1847
1848                 if (rc != 0)
1849                         GOTO(unlock_parent, rc);
1850
1851                 child = mdt_object_find(info->mti_env, info->mti_mdt,
1852                                         child_fid);
1853                 if (unlikely(IS_ERR(child)))
1854                         GOTO(unlock_parent, rc = PTR_ERR(child));
1855         }
1856
1857         mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_POS);
1858
1859         /* step 3: lock child regardless if it is local or remote. */
1860         LASSERT(child);
1861
1862         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_RESEND, obd_timeout * 2);
1863         if (!mdt_object_exists(child)) {
1864                 LU_OBJECT_DEBUG(D_INODE, info->mti_env,
1865                                 &child->mot_obj,
1866                                 "Object doesn't exist!");
1867                 GOTO(out_child, rc = -ENOENT);
1868         }
1869
1870         rc = mdt_check_resent_lock(info, child, lhc);
1871         if (rc < 0) {
1872                 GOTO(out_child, rc);
1873         } else if (rc > 0) {
                     /* rc > 0: a fresh child lock has to be enqueued; with
                      * rc == 0 this whole branch is skipped and lhc is used
                      * as it is */
1874                 mdt_lock_handle_init(lhc);
1875                 mdt_lock_reg_init(lhc, LCK_PR);
1876
1877                 if (!(child_bits & MDS_INODELOCK_UPDATE) &&
1878                       mdt_object_exists(child) && !mdt_object_remote(child)) {
1879                         struct md_attr *ma = &info->mti_attr;
1880
1881                         ma->ma_valid = 0;
1882                         ma->ma_need = MA_INODE;
1883                         rc = mdt_attr_get_complex(info, child, ma);
1884                         if (unlikely(rc != 0))
1885                                 GOTO(out_child, rc);
1886
1887                         /* If the file has not been changed for some time, we
1888                          * return not only a LOOKUP lock, but also an UPDATE
1889                          * lock and this might save us RPC on later STAT. For
1890                          * directories, it also let negative dentry cache start
1891                          * working for this dir. */
1892                         if (ma->ma_valid & MA_INODE &&
1893                             ma->ma_attr.la_valid & LA_CTIME &&
1894                             info->mti_mdt->mdt_namespace->ns_ctime_age_limit +
1895                                 ma->ma_attr.la_ctime < ktime_get_real_seconds())
1896                                 child_bits |= MDS_INODELOCK_UPDATE;
1897                 }
1898
1899                 /* layout lock must be granted in a best-effort way
1900                  * for IT operations */
1901                 LASSERT(!(child_bits & MDS_INODELOCK_LAYOUT));
1902                 if (S_ISREG(lu_object_attr(&child->mot_obj)) &&
1903                     !mdt_object_remote(child) && ldlm_rep != NULL) {
1904                         if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_GETATTR) &&
1905                             exp_connect_layout(info->mti_exp)) {
1906                                 /* try to grant layout lock for regular file. */
1907                                 try_bits = MDS_INODELOCK_LAYOUT;
1908                         }
1909                         /* Acquire DOM lock in advance for data-on-mdt file */
1910                         if (child != parent)
1911                                 try_bits |= MDS_INODELOCK_DOM;
1912                 }
1913
1914                 if (try_bits != 0) {
1915                         /* try layout lock, it may fail to be granted due to
1916                          * contention at LOOKUP or UPDATE */
1917                         rc = mdt_object_lock_try(info, child, lhc, &child_bits,
1918                                                  try_bits, false);
                             /* child_bits is passed by address and may have
                              * been changed by the call; LOV data is only
                              * requested when LAYOUT ended up in it */
1919                         if (child_bits & MDS_INODELOCK_LAYOUT)
1920                                 ma_need |= MA_LOV;
1921                 } else {
1922                         /* Do not enqueue the UPDATE lock from MDT(cross-MDT),
1923                          * client will enqueue the lock to the remote MDT */
1924                         if (mdt_object_remote(child))
1925                                 child_bits &= ~MDS_INODELOCK_UPDATE;
1926                         rc = mdt_object_lock(info, child, lhc, child_bits);
1927                 }
1928                 if (unlikely(rc != 0))
1929                         GOTO(out_child, rc);
1930         }
1931
1932         /* finally, we can get attr for child. */
1933         rc = mdt_getattr_internal(info, child, ma_need);
1934         if (unlikely(rc != 0)) {
1935                 mdt_object_unlock(info, child, lhc, 1);
1936                 GOTO(out_child, rc);
1937         }
1938
1939         mdt_pack_secctx_in_reply(info, child);
1940
1941         lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
1942         if (lock) {
1943                 /* Debugging code. */
1944                 LDLM_DEBUG(lock, "Returning lock to client");
1945                 LASSERTF(fid_res_name_eq(mdt_object_fid(child),
1946                                          &lock->l_resource->lr_name),
1947                          "Lock res_id: "DLDLMRES", fid: "DFID"\n",
1948                          PLDLMRES(lock->l_resource),
1949                          PFID(mdt_object_fid(child)));
1950
1951                 if (mdt_object_exists(child) &&
1952                     S_ISREG(lu_object_attr(&child->mot_obj)) &&
1953                     !mdt_object_remote(child) && child != parent) {
                             /* the child reference is dropped right here; the
                              * GOTO below targets unlock_parent and thereby
                              * bypasses the put under out_child */
1954                         mdt_object_put(info->mti_env, child);
1955                         rc = mdt_pack_size2body(info, child_fid,
1956                                                 &lhc->mlh_reg_lh);
1957                         if (rc && child_bits & MDS_INODELOCK_DOM) {
1958                                 /* DOM lock was taken in advance but this is
1959                                  * not DoM file. Drop the lock.
1960                                  */
1961                                 lock_res_and_lock(lock);
1962                                 ldlm_inodebits_drop(lock, MDS_INODELOCK_DOM);
1963                                 unlock_res_and_lock(lock);
1964                         }
1965                         LDLM_LOCK_PUT(lock);
                             /* the result of mdt_pack_size2body() is not
                              * propagated: rc is reset to 0 */
1966                         GOTO(unlock_parent, rc = 0);
1967                 }
1968                 LDLM_LOCK_PUT(lock);
1969         }
1970
1971         EXIT;
1972 out_child:
1973         if (child)
1974                 mdt_object_put(info->mti_env, child);
1975 unlock_parent:
             /* lhp is non-NULL only if the parent PDO lock was set up */
1976         if (lhp)
1977                 mdt_object_unlock(info, parent, lhp, 1);
1978         return rc;
1979 }
1980
1981 /* normal handler: should release the child lock */
1982 static int mdt_getattr_name(struct tgt_session_info *tsi)
1983 {
1984         struct mdt_thread_info  *info = tsi2mdt_info(tsi);
1985         struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_CHILD];
1986         struct mdt_body        *reqbody;
1987         struct mdt_body        *repbody;
1988         int rc, rc2;
1989         ENTRY;
1990
1991         reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
1992         LASSERT(reqbody != NULL);
1993         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1994         LASSERT(repbody != NULL);
1995
1996         info->mti_cross_ref = !!(reqbody->mbo_valid & OBD_MD_FLCROSSREF);
1997         repbody->mbo_eadatasize = 0;
1998         repbody->mbo_aclsize = 0;
1999
2000         rc = mdt_init_ucred_intent_getattr(info, reqbody);
2001         if (unlikely(rc))
2002                 GOTO(out_shrink, rc);
2003
2004         rc = mdt_getattr_name_lock(info, lhc, MDS_INODELOCK_UPDATE, NULL);
2005         if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
2006                 ldlm_lock_decref(&lhc->mlh_reg_lh, lhc->mlh_reg_mode);
2007                 lhc->mlh_reg_lh.cookie = 0;
2008         }
2009         mdt_exit_ucred(info);
2010         EXIT;
2011 out_shrink:
2012         mdt_client_compatibility(info);
2013         rc2 = mdt_fix_reply(info);
2014         if (rc == 0)
2015                 rc = rc2;
2016         mdt_thread_info_fini(info);
2017         return rc;
2018 }
2019
2020 static int mdt_rmfid_unlink(struct mdt_thread_info *info,
2021                             const struct lu_fid *pfid,
2022                             const struct lu_name *name,
2023                             struct mdt_object *obj, s64 ctime)
2024 {
2025         struct lu_fid *child_fid = &info->mti_tmp_fid1;
2026         struct ldlm_enqueue_info *einfo = &info->mti_einfo[0];
2027         struct mdt_device *mdt = info->mti_mdt;
2028         struct md_attr *ma = &info->mti_attr;
2029         struct mdt_lock_handle *parent_lh;
2030         struct mdt_lock_handle *child_lh;
2031         struct mdt_object *pobj;
2032         bool cos_incompat = false;
2033         int rc;
2034         ENTRY;
2035
2036         pobj = mdt_object_find(info->mti_env, mdt, pfid);
2037         if (IS_ERR(pobj))
2038                 GOTO(out, rc = PTR_ERR(pobj));
2039
2040         parent_lh = &info->mti_lh[MDT_LH_PARENT];
2041         mdt_lock_pdo_init(parent_lh, LCK_PW, name);
2042         rc = mdt_object_lock(info, pobj, parent_lh, MDS_INODELOCK_UPDATE);
2043         if (rc != 0)
2044                 GOTO(put_parent, rc);
2045
2046         if (mdt_object_remote(pobj))
2047                 cos_incompat = true;
2048
2049         rc = mdo_lookup(info->mti_env, mdt_object_child(pobj),
2050                         name, child_fid, &info->mti_spec);
2051         if (rc != 0)
2052                 GOTO(unlock_parent, rc);
2053
2054         if (!lu_fid_eq(child_fid, mdt_object_fid(obj)))
2055                 GOTO(unlock_parent, rc = -EREMCHG);
2056
2057         child_lh = &info->mti_lh[MDT_LH_CHILD];
2058         mdt_lock_reg_init(child_lh, LCK_EX);
2059         rc = mdt_reint_striped_lock(info, obj, child_lh,
2060                                     MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE,
2061                                     einfo, cos_incompat);
2062         if (rc != 0)
2063                 GOTO(unlock_parent, rc);
2064
2065         if (atomic_read(&obj->mot_open_count)) {
2066                 CDEBUG(D_OTHER, "object "DFID" open, skip\n",
2067                        PFID(mdt_object_fid(obj)));
2068                 GOTO(unlock_child, rc = -EBUSY);
2069         }
2070
2071         ma->ma_need = 0;
2072         ma->ma_valid = MA_INODE;
2073         ma->ma_attr.la_valid = LA_CTIME;
2074         ma->ma_attr.la_ctime = ctime;
2075
2076         mutex_lock(&obj->mot_lov_mutex);
2077
2078         rc = mdo_unlink(info->mti_env, mdt_object_child(pobj),
2079                         mdt_object_child(obj), name, ma, 0);
2080
2081         mutex_unlock(&obj->mot_lov_mutex);
2082
2083 unlock_child:
2084         mdt_reint_striped_unlock(info, obj, child_lh, einfo, 1);
2085 unlock_parent:
2086         mdt_object_unlock(info, pobj, parent_lh, 1);
2087 put_parent:
2088         mdt_object_put(info->mti_env, pobj);
2089 out:
2090         RETURN(rc);
2091 }
2092
2093 static int mdt_rmfid_check_permission(struct mdt_thread_info *info,
2094                                         struct mdt_object *obj)
2095 {
2096         struct lu_ucred *uc = lu_ucred(info->mti_env);
2097         struct md_attr *ma = &info->mti_attr;
2098         struct lu_attr *la = &ma->ma_attr;
2099         int rc = 0;
2100         ENTRY;
2101
2102         ma->ma_need = MA_INODE;
2103         rc = mo_attr_get(info->mti_env, mdt_object_child(obj), ma);
2104         if (rc)
2105                 GOTO(out, rc);
2106
2107         if (la->la_flags & LUSTRE_IMMUTABLE_FL)
2108                         rc = -EACCES;
2109
2110         if (md_capable(uc, CFS_CAP_DAC_OVERRIDE))
2111                 RETURN(0);
2112         if (uc->uc_fsuid == la->la_uid) {
2113                 if ((la->la_mode & S_IWUSR) == 0)
2114                         rc = -EACCES;
2115         } else if (uc->uc_fsgid == la->la_gid) {
2116                 if ((la->la_mode & S_IWGRP) == 0)
2117                         rc = -EACCES;
2118         } else if ((la->la_mode & S_IWOTH) == 0) {
2119                         rc = -EACCES;
2120         }
2121
2122 out:
2123         RETURN(rc);
2124 }
2125
2126 static int mdt_rmfid_one(struct mdt_thread_info *info, struct lu_fid *fid,
2127                          s64 ctime)
2128 {
2129         struct mdt_device *mdt = info->mti_mdt;
2130         struct mdt_object *obj = NULL;
2131         struct linkea_data ldata = { NULL };
2132         struct lu_buf *buf = &info->mti_big_buf;
2133         struct lu_name *name = &info->mti_name;
2134         struct lu_fid *pfid = &info->mti_tmp_fid1;
2135         struct link_ea_header *leh;
2136         struct link_ea_entry *lee;
2137         int reclen, count, rc = 0;
2138         ENTRY;
2139
2140         if (!fid_is_sane(fid))
2141                 GOTO(out, rc = -EINVAL);
2142
2143         if (!fid_is_namespace_visible(fid))
2144                 GOTO(out, rc = -EINVAL);
2145
2146         obj = mdt_object_find(info->mti_env, mdt, fid);
2147         if (IS_ERR(obj))
2148                 GOTO(out, rc = PTR_ERR(obj));
2149
2150         if (mdt_object_remote(obj))
2151                 GOTO(out, rc = -EREMOTE);
2152         if (!mdt_object_exists(obj) || lu_object_is_dying(&obj->mot_header))
2153                 GOTO(out, rc = -ENOENT);
2154
2155         rc = mdt_rmfid_check_permission(info, obj);
2156         if (rc)
2157                 GOTO(out, rc);
2158
2159         /* take LinkEA */
2160         buf = lu_buf_check_and_alloc(buf, PATH_MAX);
2161         if (!buf->lb_buf)
2162                 GOTO(out, rc = -ENOMEM);
2163
2164         ldata.ld_buf = buf;
2165         rc = mdt_links_read(info, obj, &ldata);
2166         if (rc)
2167                 GOTO(out, rc);
2168
2169         leh = buf->lb_buf;
2170         lee = (struct link_ea_entry *)(leh + 1);
2171         for (count = 0; count < leh->leh_reccount; count++) {
2172                 /* remove every hardlink */
2173                 linkea_entry_unpack(lee, &reclen, name, pfid);
2174                 lee = (struct link_ea_entry *) ((char *)lee + reclen);
2175                 rc = mdt_rmfid_unlink(info, pfid, name, obj, ctime);
2176                 if (rc)
2177                         break;
2178         }
2179
2180 out:
2181         if (obj && !IS_ERR(obj))
2182                 mdt_object_put(info->mti_env, obj);
2183         if (info->mti_big_buf.lb_buf)
2184                 lu_buf_free(&info->mti_big_buf);
2185
2186         RETURN(rc);
2187 }
2188
2189 static int mdt_rmfid(struct tgt_session_info *tsi)
2190 {
2191         struct mdt_thread_info *mti = tsi2mdt_info(tsi);
2192         struct mdt_body *reqbody;
2193         struct lu_fid *fids, *rfids;
2194         int bufsize, rc;
2195         __u32 *rcs;
2196         int i, nr;
2197         ENTRY;
2198
2199         reqbody = req_capsule_client_get(tsi->tsi_pill, &RMF_MDT_BODY);
2200         if (reqbody == NULL)
2201                 RETURN(-EPROTO);
2202         bufsize = req_capsule_get_size(tsi->tsi_pill, &RMF_FID_ARRAY,
2203                                        RCL_CLIENT);
2204         nr = bufsize / sizeof(struct lu_fid);
2205         if (nr * sizeof(struct lu_fid) != bufsize)
2206                 RETURN(-EINVAL);
2207         req_capsule_set_size(tsi->tsi_pill, &RMF_RCS,
2208                              RCL_SERVER, nr * sizeof(__u32));
2209         req_capsule_set_size(tsi->tsi_pill, &RMF_FID_ARRAY,
2210                              RCL_SERVER, nr * sizeof(struct lu_fid));
2211         rc = req_capsule_server_pack(tsi->tsi_pill);
2212         if (rc)
2213                 GOTO(out, rc = err_serious(rc));
2214         fids = req_capsule_client_get(tsi->tsi_pill, &RMF_FID_ARRAY);
2215         if (fids == NULL)
2216                 RETURN(-EPROTO);
2217         rcs = req_capsule_server_get(tsi->tsi_pill, &RMF_RCS);
2218         LASSERT(rcs);
2219         rfids = req_capsule_server_get(tsi->tsi_pill, &RMF_FID_ARRAY);
2220         LASSERT(rfids);
2221
2222         mdt_init_ucred(mti, reqbody);
2223         for (i = 0; i < nr; i++) {
2224                 rfids[i] = fids[i];
2225                 rcs[i] = mdt_rmfid_one(mti, fids + i, reqbody->mbo_ctime);
2226         }
2227         mdt_exit_ucred(mti);
2228
2229 out:
2230         RETURN(rc);
2231 }
2232
/* Forward declaration: mdt_set_info() calls mdt_iocontrol() to process
 * KEY_CHANGELOG_CLEAR. */
2233 static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
2234                          void *karg, void __user *uarg);
2235
2236 static int mdt_set_info(struct tgt_session_info *tsi)
2237 {
2238         struct ptlrpc_request   *req = tgt_ses_req(tsi);
2239         char                    *key;
2240         void                    *val;
2241         int                      keylen, vallen, rc = 0;
2242
2243         ENTRY;
2244
2245         key = req_capsule_client_get(tsi->tsi_pill, &RMF_SETINFO_KEY);
2246         if (key == NULL) {
2247                 DEBUG_REQ(D_HA, req, "no set_info key");
2248                 RETURN(err_serious(-EFAULT));
2249         }
2250
2251         keylen = req_capsule_get_size(tsi->tsi_pill, &RMF_SETINFO_KEY,
2252                                       RCL_CLIENT);
2253
2254         val = req_capsule_client_get(tsi->tsi_pill, &RMF_SETINFO_VAL);
2255         if (val == NULL) {
2256                 DEBUG_REQ(D_HA, req, "no set_info val");
2257                 RETURN(err_serious(-EFAULT));
2258         }
2259
2260         vallen = req_capsule_get_size(tsi->tsi_pill, &RMF_SETINFO_VAL,
2261                                       RCL_CLIENT);
2262
2263         /* Swab any part of val you need to here */
2264         if (KEY_IS(KEY_READ_ONLY)) {
2265                 spin_lock(&req->rq_export->exp_lock);
2266                 if (*(__u32 *)val)
2267                         *exp_connect_flags_ptr(req->rq_export) |=
2268                                 OBD_CONNECT_RDONLY;
2269                 else
2270                         *exp_connect_flags_ptr(req->rq_export) &=
2271                                 ~OBD_CONNECT_RDONLY;
2272                 spin_unlock(&req->rq_export->exp_lock);
2273         } else if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
2274                 struct changelog_setinfo *cs = val;
2275
2276                 if (vallen != sizeof(*cs)) {
2277                         CERROR("%s: bad changelog_clear setinfo size %d\n",
2278                                tgt_name(tsi->tsi_tgt), vallen);
2279                         RETURN(-EINVAL);
2280                 }
2281                 if (ptlrpc_req_need_swab(req)) {
2282                         __swab64s(&cs->cs_recno);
2283                         __swab32s(&cs->cs_id);
2284                 }
2285
2286                 rc = mdt_iocontrol(OBD_IOC_CHANGELOG_CLEAR, req->rq_export,
2287                                    vallen, val, NULL);
2288         } else if (KEY_IS(KEY_EVICT_BY_NID)) {
2289                 if (vallen > 0)
2290                         obd_export_evict_by_nid(req->rq_export->exp_obd, val);
2291         } else {
2292                 RETURN(-EINVAL);
2293         }
2294         RETURN(rc);
2295 }
2296
2297 static int mdt_readpage(struct tgt_session_info *tsi)
2298 {
2299         struct mdt_thread_info  *info = mdt_th_info(tsi->tsi_env);
2300         struct mdt_object       *object = mdt_obj(tsi->tsi_corpus);
2301         struct lu_rdpg          *rdpg = &info->mti_u.rdpg.mti_rdpg;
2302         const struct mdt_body   *reqbody = tsi->tsi_mdt_body;
2303         struct mdt_body         *repbody;
2304         int                      rc;
2305         int                      i;
2306
2307         ENTRY;
2308
2309         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_READPAGE_PACK))
2310                 RETURN(err_serious(-ENOMEM));
2311
2312         repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_MDT_BODY);
2313         if (repbody == NULL || reqbody == NULL)
2314                 RETURN(err_serious(-EFAULT));
2315
2316         /*
2317          * prepare @rdpg before calling lower layers and transfer itself. Here
2318          * reqbody->size contains offset of where to start to read and
2319          * reqbody->nlink contains number bytes to read.
2320          */
2321         rdpg->rp_hash = reqbody->mbo_size;
2322         if (rdpg->rp_hash != reqbody->mbo_size) {
2323                 CERROR("Invalid hash: %#llx != %#llx\n",
2324                        rdpg->rp_hash, reqbody->mbo_size);
2325                 RETURN(-EFAULT);
2326         }
2327
2328         rdpg->rp_attrs = reqbody->mbo_mode;
2329         if (exp_connect_flags(tsi->tsi_exp) & OBD_CONNECT_64BITHASH)
2330                 rdpg->rp_attrs |= LUDA_64BITHASH;
2331         rdpg->rp_count  = min_t(unsigned int, reqbody->mbo_nlink,
2332                                 exp_max_brw_size(tsi->tsi_exp));
2333         rdpg->rp_npages = (rdpg->rp_count + PAGE_SIZE - 1) >>
2334                           PAGE_SHIFT;
2335         OBD_ALLOC(rdpg->rp_pages, rdpg->rp_npages * sizeof rdpg->rp_pages[0]);
2336         if (rdpg->rp_pages == NULL)
2337                 RETURN(-ENOMEM);
2338
2339         for (i = 0; i < rdpg->rp_npages; ++i) {
2340                 rdpg->rp_pages[i] = alloc_page(GFP_NOFS);
2341                 if (rdpg->rp_pages[i] == NULL)
2342                         GOTO(free_rdpg, rc = -ENOMEM);
2343         }
2344
2345         /* call lower layers to fill allocated pages with directory data */
2346         rc = mo_readpage(tsi->tsi_env, mdt_object_child(object), rdpg);
2347         if (rc < 0)
2348                 GOTO(free_rdpg, rc);
2349
2350         /* send pages to client */
2351         rc = tgt_sendpage(tsi, rdpg, rc);
2352
2353         EXIT;
2354 free_rdpg:
2355
2356         for (i = 0; i < rdpg->rp_npages; i++)
2357                 if (rdpg->rp_pages[i] != NULL)
2358                         __free_page(rdpg->rp_pages[i]);
2359         OBD_FREE(rdpg->rp_pages, rdpg->rp_npages * sizeof rdpg->rp_pages[0]);
2360
2361         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SENDPAGE))
2362                 RETURN(0);
2363
2364         return rc;
2365 }
2366
2367 static int mdt_fix_attr_ucred(struct mdt_thread_info *info, __u32 op)
2368 {
2369         struct lu_ucred *uc = mdt_ucred_check(info);
2370         struct lu_attr *attr = &info->mti_attr.ma_attr;
2371
2372         if (uc == NULL)
2373                 return -EINVAL;
2374
2375         if (op != REINT_SETATTR) {
2376                 if ((attr->la_valid & LA_UID) && (attr->la_uid != -1))
2377                         attr->la_uid = uc->uc_fsuid;
2378                 /* for S_ISGID, inherit gid from his parent, such work will be
2379                  * done in cmm/mdd layer, here set all cases as uc->uc_fsgid. */
2380                 if ((attr->la_valid & LA_GID) && (attr->la_gid != -1))
2381                         attr->la_gid = uc->uc_fsgid;
2382         }
2383
2384         return 0;
2385 }
2386
2387 static void mdt_preset_secctx_size(struct mdt_thread_info *info)
2388 {
2389         struct req_capsule *pill = info->mti_pill;
2390
2391         if (req_capsule_has_field(pill, &RMF_FILE_SECCTX,
2392                                   RCL_SERVER) &&
2393             req_capsule_has_field(pill, &RMF_FILE_SECCTX_NAME,
2394                                   RCL_CLIENT)) {
2395                 if (req_capsule_get_size(pill, &RMF_FILE_SECCTX_NAME,
2396                                          RCL_CLIENT) != 0) {
2397                         /* pre-set size in server part with max size */
2398                         req_capsule_set_size(pill, &RMF_FILE_SECCTX,
2399                                              RCL_SERVER,
2400                                              info->mti_mdt->mdt_max_ea_size);
2401                 } else {
2402                         req_capsule_set_size(pill, &RMF_FILE_SECCTX,
2403                                              RCL_SERVER, 0);
2404                 }
2405         }
2406
2407 }
2408
2409 static int mdt_reint_internal(struct mdt_thread_info *info,
2410                               struct mdt_lock_handle *lhc,
2411                               __u32 op)
2412 {
2413         struct req_capsule      *pill = info->mti_pill;
2414         struct mdt_body         *repbody;
2415         int                      rc = 0, rc2;
2416
2417         ENTRY;
2418
2419         rc = mdt_reint_unpack(info, op);
2420         if (rc != 0) {
2421                 CERROR("Can't unpack reint, rc %d\n", rc);
2422                 RETURN(err_serious(rc));
2423         }
2424
2425         /* for replay (no_create) lmm is not needed, client has it already */
2426         if (req_capsule_has_field(pill, &RMF_MDT_MD, RCL_SERVER))
2427                 req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER,
2428                                      DEF_REP_MD_SIZE);
2429
2430         /* llog cookies are always 0, the field is kept for compatibility */
2431         if (req_capsule_has_field(pill, &RMF_LOGCOOKIES, RCL_SERVER))
2432                 req_capsule_set_size(pill, &RMF_LOGCOOKIES, RCL_SERVER, 0);
2433
2434         /* Set ACL reply buffer size as LUSTRE_POSIX_ACL_MAX_SIZE_OLD
2435          * by default. If the target object has more ACL entries, then
2436          * enlarge the buffer when necessary. */
2437         if (req_capsule_has_field(pill, &RMF_ACL, RCL_SERVER))
2438                 req_capsule_set_size(pill, &RMF_ACL, RCL_SERVER,
2439                                      LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
2440
2441         mdt_preset_secctx_size(info);
2442
2443         rc = req_capsule_server_pack(pill);
2444         if (rc != 0) {
2445                 CERROR("Can't pack response, rc %d\n", rc);
2446                 RETURN(err_serious(rc));
2447         }
2448
2449         if (req_capsule_has_field(pill, &RMF_MDT_BODY, RCL_SERVER)) {
2450                 repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
2451                 LASSERT(repbody);
2452                 repbody->mbo_eadatasize = 0;
2453                 repbody->mbo_aclsize = 0;
2454         }
2455
2456         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_REINT_DELAY, 10);
2457
2458         /* for replay no cookkie / lmm need, because client have this already */
2459         if (info->mti_spec.no_create)
2460                 if (req_capsule_has_field(pill, &RMF_MDT_MD, RCL_SERVER))
2461                         req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, 0);
2462
2463         rc = mdt_init_ucred_reint(info);
2464         if (rc)
2465                 GOTO(out_shrink, rc);
2466
2467         rc = mdt_fix_attr_ucred(info, op);
2468         if (rc != 0)
2469                 GOTO(out_ucred, rc = err_serious(rc));
2470
2471         rc = mdt_check_resent(info, mdt_reconstruct, lhc);
2472         if (rc < 0) {
2473                 GOTO(out_ucred, rc);
2474         } else if (rc == 1) {
2475                 DEBUG_REQ(D_INODE, mdt_info_req(info), "resent opt.");
2476                 rc = lustre_msg_get_status(mdt_info_req(info)->rq_repmsg);
2477                 GOTO(out_ucred, rc);
2478         }
2479         rc = mdt_reint_rec(info, lhc);
2480         EXIT;
2481 out_ucred:
2482         mdt_exit_ucred(info);
2483 out_shrink:
2484         mdt_client_compatibility(info);
2485
2486         rc2 = mdt_fix_reply(info);
2487         if (rc == 0)
2488                 rc = rc2;
2489
2490         /*
2491          * Data-on-MDT optimization - read data along with OPEN and return it
2492          * in reply. Do that only if we have both DOM and LAYOUT locks.
2493          */
2494         if (rc == 0 && op == REINT_OPEN && !req_is_replay(pill->rc_req) &&
2495             info->mti_attr.ma_lmm != NULL &&
2496             mdt_lmm_dom_entry(info->mti_attr.ma_lmm) == LMM_DOM_ONLY) {
2497                 rc = mdt_dom_read_on_open(info, info->mti_mdt,
2498                                           &lhc->mlh_reg_lh);
2499         }
2500
2501         return rc;
2502 }
2503
2504 static long mdt_reint_opcode(struct ptlrpc_request *req,
2505                              const struct req_format **fmt)
2506 {
2507         struct mdt_device       *mdt;
2508         struct mdt_rec_reint    *rec;
2509         long                     opc;
2510
2511         rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
2512         if (rec != NULL) {
2513                 opc = rec->rr_opcode;
2514                 DEBUG_REQ(D_INODE, req, "reint opt = %ld", opc);
2515                 if (opc < REINT_MAX && fmt[opc] != NULL)
2516                         req_capsule_extend(&req->rq_pill, fmt[opc]);
2517                 else {
2518                         mdt = mdt_exp2dev(req->rq_export);
2519                         CERROR("%s: Unsupported opcode '%ld' from client '%s':"
2520                                " rc = %d\n", req->rq_export->exp_obd->obd_name,
2521                                opc, mdt->mdt_ldlm_client->cli_name, -EFAULT);
2522                         opc = err_serious(-EFAULT);
2523                 }
2524         } else {
2525                 opc = err_serious(-EFAULT);
2526         }
2527         return opc;
2528 }
2529
2530 static int mdt_reint(struct tgt_session_info *tsi)
2531 {
2532         long opc;
2533         int  rc;
2534         static const struct req_format *reint_fmts[REINT_MAX] = {
2535                 [REINT_SETATTR]  = &RQF_MDS_REINT_SETATTR,
2536                 [REINT_CREATE]   = &RQF_MDS_REINT_CREATE,
2537                 [REINT_LINK]     = &RQF_MDS_REINT_LINK,
2538                 [REINT_UNLINK]   = &RQF_MDS_REINT_UNLINK,
2539                 [REINT_RENAME]   = &RQF_MDS_REINT_RENAME,
2540                 [REINT_OPEN]     = &RQF_MDS_REINT_OPEN,
2541                 [REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR,
2542                 [REINT_RMENTRY]  = &RQF_MDS_REINT_UNLINK,
2543                 [REINT_MIGRATE]  = &RQF_MDS_REINT_MIGRATE,
2544                 [REINT_RESYNC]   = &RQF_MDS_REINT_RESYNC,
2545         };
2546
2547         ENTRY;
2548
2549         opc = mdt_reint_opcode(tgt_ses_req(tsi), reint_fmts);
2550         if (opc >= 0) {
2551                 struct mdt_thread_info *info = tsi2mdt_info(tsi);
2552                 /*
2553                  * No lock possible here from client to pass it to reint code
2554                  * path.
2555                  */
2556                 rc = mdt_reint_internal(info, NULL, opc);
2557                 mdt_thread_info_fini(info);
2558         } else {
2559                 rc = opc;
2560         }
2561
2562         tsi->tsi_reply_fail_id = OBD_FAIL_MDS_REINT_NET_REP;
2563         RETURN(rc);
2564 }
2565
2566 /* this should sync the whole device */
2567 int mdt_device_sync(const struct lu_env *env, struct mdt_device *mdt)
2568 {
2569         struct dt_device *dt = mdt->mdt_bottom;
2570         int rc;
2571         ENTRY;
2572
2573         rc = dt->dd_ops->dt_sync(env, dt);
2574         RETURN(rc);
2575 }
2576
2577 /* this should sync this object */
2578 static int mdt_object_sync(const struct lu_env *env, struct obd_export *exp,
2579                            struct mdt_object *mo)
2580 {
2581         int rc;
2582
2583         ENTRY;
2584
2585         if (!mdt_object_exists(mo)) {
2586                 CWARN("%s: non existing object "DFID": rc = %d\n",
2587                       exp->exp_obd->obd_name, PFID(mdt_object_fid(mo)),
2588                       -ESTALE);
2589                 RETURN(-ESTALE);
2590         }
2591
2592         rc = mo_object_sync(env, mdt_object_child(mo));
2593
2594         RETURN(rc);
2595 }
2596
2597 static int mdt_sync(struct tgt_session_info *tsi)
2598 {
2599         struct ptlrpc_request   *req = tgt_ses_req(tsi);
2600         struct req_capsule      *pill = tsi->tsi_pill;
2601         struct mdt_body         *body;
2602         int                      rc;
2603
2604         ENTRY;
2605
2606         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_SYNC_PACK))
2607                 RETURN(err_serious(-ENOMEM));
2608
2609         if (fid_seq(&tsi->tsi_mdt_body->mbo_fid1) == 0) {
2610                 rc = mdt_device_sync(tsi->tsi_env, mdt_exp2dev(tsi->tsi_exp));
2611         } else {
2612                 struct mdt_thread_info *info = tsi2mdt_info(tsi);
2613
2614                 if (unlikely(info->mti_object == NULL))
2615                         RETURN(-EPROTO);
2616
2617                 /* sync an object */
2618                 rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp,
2619                                      info->mti_object);
2620                 if (rc == 0) {
2621                         const struct lu_fid *fid;
2622                         struct lu_attr *la = &info->mti_attr.ma_attr;
2623
2624                         info->mti_attr.ma_need = MA_INODE;
2625                         info->mti_attr.ma_valid = 0;
2626                         rc = mdt_attr_get_complex(info, info->mti_object,
2627                                                   &info->mti_attr);
2628                         if (rc == 0) {
2629                                 body = req_capsule_server_get(pill,
2630                                                               &RMF_MDT_BODY);
2631                                 fid = mdt_object_fid(info->mti_object);
2632                                 mdt_pack_attr2body(info, body, la, fid);
2633                         }
2634                 }
2635                 mdt_thread_info_fini(info);
2636         }
2637         if (rc == 0)
2638                 mdt_counter_incr(req, LPROC_MDT_SYNC);
2639
2640         RETURN(rc);
2641 }
2642
2643 static int mdt_data_sync(struct tgt_session_info *tsi)
2644 {
2645         struct mdt_thread_info *info;
2646         struct mdt_device *mdt = mdt_exp2dev(tsi->tsi_exp);
2647         struct ost_body *body = tsi->tsi_ost_body;
2648         struct ost_body *repbody;
2649         struct mdt_object *mo = NULL;
2650         struct md_attr *ma;
2651         int rc = 0;
2652
2653         ENTRY;
2654
2655         repbody = req_capsule_server_get(tsi->tsi_pill, &RMF_OST_BODY);
2656
2657         /* if no fid is specified then do nothing,
2658          * device sync is done via MDS_SYNC */
2659         if (fid_is_zero(&tsi->tsi_fid))
2660                 RETURN(0);
2661
2662         mo = mdt_object_find(tsi->tsi_env, mdt, &tsi->tsi_fid);
2663         if (IS_ERR(mo))
2664                 RETURN(PTR_ERR(mo));
2665
2666         rc = mdt_object_sync(tsi->tsi_env, tsi->tsi_exp, mo);
2667         if (rc)
2668                 GOTO(put, rc);
2669
2670         repbody->oa.o_oi = body->oa.o_oi;
2671         repbody->oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2672
2673         info = tsi2mdt_info(tsi);
2674         ma = &info->mti_attr;
2675         ma->ma_need = MA_INODE;
2676         ma->ma_valid = 0;
2677         rc = mdt_attr_get_complex(info, mo, ma);
2678         if (rc == 0)
2679                 obdo_from_la(&repbody->oa, &ma->ma_attr, VALID_FLAGS);
2680         else
2681                 rc = 0;
2682         mdt_thread_info_fini(info);
2683
2684         EXIT;
2685 put:
2686         if (mo != NULL)
2687                 mdt_object_put(tsi->tsi_env, mo);
2688         return rc;
2689 }
2690
2691 /*
2692  * Handle quota control requests to consult current usage/limit, but also
2693  * to configure quota enforcement
2694  */
2695 static int mdt_quotactl(struct tgt_session_info *tsi)
2696 {
2697         struct obd_export       *exp  = tsi->tsi_exp;
2698         struct req_capsule      *pill = tsi->tsi_pill;
2699         struct obd_quotactl     *oqctl, *repoqc;
2700         int                      id, rc;
2701         struct mdt_device       *mdt = mdt_exp2dev(exp);
2702         struct lu_device        *qmt = mdt->mdt_qmt_dev;
2703         struct lu_nodemap       *nodemap;
2704         ENTRY;
2705
2706         oqctl = req_capsule_client_get(pill, &RMF_OBD_QUOTACTL);
2707         if (oqctl == NULL)
2708                 RETURN(err_serious(-EPROTO));
2709
2710         rc = req_capsule_server_pack(pill);
2711         if (rc)
2712                 RETURN(err_serious(rc));
2713
2714         nodemap = nodemap_get_from_exp(exp);
2715         if (IS_ERR(nodemap))
2716                 RETURN(PTR_ERR(nodemap));
2717
2718         switch (oqctl->qc_cmd) {
2719                 /* master quotactl */
2720         case Q_SETINFO:
2721         case Q_SETQUOTA:
2722         case LUSTRE_Q_SETDEFAULT:
2723                 if (!nodemap_can_setquota(nodemap))
2724                         GOTO(out_nodemap, rc = -EPERM);
2725         case Q_GETINFO:
2726         case Q_GETQUOTA:
2727         case LUSTRE_Q_GETDEFAULT:
2728                 if (qmt == NULL)
2729                         GOTO(out_nodemap, rc = -EOPNOTSUPP);
2730                 /* slave quotactl */
2731         case Q_GETOINFO:
2732         case Q_GETOQUOTA:
2733                 break;
2734         default:
2735                 CERROR("Unsupported quotactl command: %d\n", oqctl->qc_cmd);
2736                 GOTO(out_nodemap, rc = -EFAULT);
2737         }
2738
2739         id = oqctl->qc_id;
2740         switch (oqctl->qc_type) {
2741         case USRQUOTA:
2742                 id = nodemap_map_id(nodemap, NODEMAP_UID,
2743                                     NODEMAP_CLIENT_TO_FS, id);
2744                 break;
2745         case GRPQUOTA:
2746                 id = nodemap_map_id(nodemap, NODEMAP_GID,
2747                                     NODEMAP_CLIENT_TO_FS, id);
2748                 break;
2749         case PRJQUOTA:
2750                 /* todo: check/map project id */
2751                 id = oqctl->qc_id;
2752                 break;
2753         default:
2754                 GOTO(out_nodemap, rc = -EOPNOTSUPP);
2755         }
2756         repoqc = req_capsule_server_get(pill, &RMF_OBD_QUOTACTL);
2757         if (repoqc == NULL)
2758                 GOTO(out_nodemap, rc = err_serious(-EFAULT));
2759
2760         if (oqctl->qc_cmd == Q_SETINFO || oqctl->qc_cmd == Q_SETQUOTA)
2761                 barrier_exit(tsi->tsi_tgt->lut_bottom);
2762
2763         if (oqctl->qc_id != id)
2764                 swap(oqctl->qc_id, id);
2765
2766         if (oqctl->qc_cmd == Q_SETINFO || oqctl->qc_cmd == Q_SETQUOTA) {
2767                 if (unlikely(!barrier_entry(tsi->tsi_tgt->lut_bottom)))
2768                         RETURN(-EINPROGRESS);
2769         }
2770
2771         switch (oqctl->qc_cmd) {
2772
2773         case Q_GETINFO:
2774         case Q_SETINFO:
2775         case Q_SETQUOTA:
2776         case Q_GETQUOTA:
2777         case LUSTRE_Q_SETDEFAULT:
2778         case LUSTRE_Q_GETDEFAULT:
2779                 /* forward quotactl request to QMT */
2780                 rc = qmt_hdls.qmth_quotactl(tsi->tsi_env, qmt, oqctl);
2781                 break;
2782
2783         case Q_GETOINFO:
2784         case Q_GETOQUOTA:
2785                 /* slave quotactl */
2786                 rc = lquotactl_slv(tsi->tsi_env, tsi->tsi_tgt->lut_bottom,
2787                                    oqctl);
2788                 break;
2789
2790         default:
2791                 CERROR("Unsupported quotactl command: %d\n", oqctl->qc_cmd);
2792                 GOTO(out_nodemap, rc = -EFAULT);
2793         }
2794
2795         if (oqctl->qc_id != id)
2796                 swap(oqctl->qc_id, id);
2797
2798         *repoqc = *oqctl;
2799
2800         EXIT;
2801
2802 out_nodemap:
2803         nodemap_putref(nodemap);
2804
2805         return rc;
2806 }
2807
2808 /** clone llog ctxt from child (mdd)
2809  * This allows remote llog (replicator) access.
2810  * We can either pass all llog RPCs (eg mdt_llog_create) on to child where the
2811  * context was originally set up, or we can handle them directly.
2812  * I choose the latter, but that means I need any llog
2813  * contexts set up by child to be accessable by the mdt.  So we clone the
2814  * context into our context list here.
2815  */
2816 static int mdt_llog_ctxt_clone(const struct lu_env *env, struct mdt_device *mdt,
2817                                int idx)
2818 {
2819         struct md_device  *next = mdt->mdt_child;
2820         struct llog_ctxt *ctxt;
2821         int rc;
2822
2823         if (!llog_ctxt_null(mdt2obd_dev(mdt), idx))
2824                 return 0;
2825
2826         rc = next->md_ops->mdo_llog_ctxt_get(env, next, idx, (void **)&ctxt);
2827         if (rc || ctxt == NULL) {
2828                 return 0;
2829         }
2830
2831         rc = llog_group_set_ctxt(&mdt2obd_dev(mdt)->obd_olg, ctxt, idx);
2832         if (rc)
2833                 CERROR("Can't set mdt ctxt %d\n", rc);
2834
2835         return rc;
2836 }
2837
2838 static int mdt_llog_ctxt_unclone(const struct lu_env *env,
2839                                  struct mdt_device *mdt, int idx)
2840 {
2841         struct llog_ctxt *ctxt;
2842
2843         ctxt = llog_get_context(mdt2obd_dev(mdt), idx);
2844         if (ctxt == NULL)
2845                 return 0;
2846         /* Put once for the get we just did, and once for the clone */
2847         llog_ctxt_put(ctxt);
2848         llog_ctxt_put(ctxt);
2849         return 0;
2850 }
2851
2852 /*
2853  * sec context handlers
2854  */
2855 static int mdt_sec_ctx_handle(struct tgt_session_info *tsi)
2856 {
2857         CFS_FAIL_TIMEOUT(OBD_FAIL_SEC_CTX_HDL_PAUSE, cfs_fail_val);
2858
2859         return 0;
2860 }
2861
2862 /*
2863  * quota request handlers
2864  */
2865 static int mdt_quota_dqacq(struct tgt_session_info *tsi)
2866 {
2867         struct mdt_device       *mdt = mdt_exp2dev(tsi->tsi_exp);
2868         struct lu_device        *qmt = mdt->mdt_qmt_dev;
2869         int                      rc;
2870         ENTRY;
2871
2872         if (qmt == NULL)
2873                 RETURN(err_serious(-EOPNOTSUPP));
2874
2875         rc = qmt_hdls.qmth_dqacq(tsi->tsi_env, qmt, tgt_ses_req(tsi));
2876         RETURN(rc);
2877 }
2878
2879 struct mdt_object *mdt_object_new(const struct lu_env *env,
2880                                   struct mdt_device *d,
2881                                   const struct lu_fid *f)
2882 {
2883         struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
2884         struct lu_object *o;
2885         struct mdt_object *m;
2886         ENTRY;
2887
2888         CDEBUG(D_INFO, "Allocate object for "DFID"\n", PFID(f));
2889         o = lu_object_find(env, &d->mdt_lu_dev, f, &conf);
2890         if (unlikely(IS_ERR(o)))
2891                 m = (struct mdt_object *)o;
2892         else
2893                 m = mdt_obj(o);
2894         RETURN(m);
2895 }
2896
2897 struct mdt_object *mdt_object_find(const struct lu_env *env,
2898                                    struct mdt_device *d,
2899                                    const struct lu_fid *f)
2900 {
2901         struct lu_object *o;
2902         struct mdt_object *m;
2903         ENTRY;
2904
2905         CDEBUG(D_INFO, "Find object for "DFID"\n", PFID(f));
2906         o = lu_object_find(env, &d->mdt_lu_dev, f, NULL);
2907         if (unlikely(IS_ERR(o)))
2908                 m = (struct mdt_object *)o;
2909         else
2910                 m = mdt_obj(o);
2911
2912         RETURN(m);
2913 }
2914
2915 /**
2916  * Asyncronous commit for mdt device.
2917  *
2918  * Pass asynchonous commit call down the MDS stack.
2919  *
2920  * \param env environment
2921  * \param mdt the mdt device
2922  */
2923 static void mdt_device_commit_async(const struct lu_env *env,
2924                                     struct mdt_device *mdt)
2925 {
2926         struct dt_device *dt = mdt->mdt_bottom;
2927         int rc;
2928         ENTRY;
2929
2930         rc = dt->dd_ops->dt_commit_async(env, dt);
2931         if (unlikely(rc != 0))
2932                 CWARN("%s: async commit start failed: rc = %d\n",
2933                       mdt_obd_name(mdt), rc);
2934         atomic_inc(&mdt->mdt_async_commit_count);
2935         EXIT;
2936 }
2937
2938 /**
2939  * Mark the lock as "synchonous".
2940  *
2941  * Mark the lock to deffer transaction commit to the unlock time.
2942  *
2943  * \param lock the lock to mark as "synchonous"
2944  *
2945  * \see mdt_is_lock_sync
2946  * \see mdt_save_lock
2947  */
2948 static inline void mdt_set_lock_sync(struct ldlm_lock *lock)
2949 {
2950         lock->l_ast_data = (void*)1;
2951 }
2952
2953 /**
2954  * Check whehter the lock "synchonous" or not.
2955  *
2956  * \param lock the lock to check
2957  * \retval 1 the lock is "synchonous"
2958  * \retval 0 the lock isn't "synchronous"
2959  *
2960  * \see mdt_set_lock_sync
2961  * \see mdt_save_lock
2962  */
2963 static inline int mdt_is_lock_sync(struct ldlm_lock *lock)
2964 {
2965         return lock->l_ast_data != NULL;
2966 }
2967
2968 /**
2969  * Blocking AST for mdt locks.
2970  *
2971  * Starts transaction commit if in case of COS lock conflict or
2972  * deffers such a commit to the mdt_save_lock.
2973  *
2974  * \param lock the lock which blocks a request or cancelling lock
2975  * \param desc unused
2976  * \param data unused
2977  * \param flag indicates whether this cancelling or blocking callback
2978  * \retval 0
2979  * \see ldlm_blocking_ast_nocheck
2980  */
2981 int mdt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
2982                      void *data, int flag)
2983 {
2984         struct obd_device *obd = ldlm_lock_to_ns(lock)->ns_obd;
2985         struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev);
2986         struct ldlm_cb_set_arg *arg = data;
2987         bool commit_async = false;
2988         int rc;
2989         ENTRY;
2990
2991         if (flag == LDLM_CB_CANCELING)
2992                 RETURN(0);
2993
2994         lock_res_and_lock(lock);
2995         if (lock->l_blocking_ast != mdt_blocking_ast) {
2996                 unlock_res_and_lock(lock);
2997                 RETURN(0);
2998         }
2999
3000         /* A blocking ast may be sent from ldlm_lock_decref_internal
3001          * when the last reference to a local lock was released and
3002          * during blocking event from ldlm_work_bl_ast_lock().
3003          * The 'data' parameter is l_ast_data in the first case and
3004          * callback arguments in the second one. Distinguish them by that.
3005          */
3006         if (!data || data == lock->l_ast_data || !arg->bl_desc)
3007                 goto skip_cos_checks;
3008
3009         if (lock->l_req_mode & (LCK_PW | LCK_EX)) {
3010                 if (mdt_cos_is_enabled(mdt)) {
3011                         if (!arg->bl_desc->bl_same_client)
3012                                 mdt_set_lock_sync(lock);
3013                 } else if (mdt_slc_is_enabled(mdt) &&
3014                            arg->bl_desc->bl_cos_incompat) {
3015                         mdt_set_lock_sync(lock);
3016                         /*
3017                          * we may do extra commit here, but there is a small
3018                          * window to miss a commit: lock was unlocked (saved),
3019                          * then a conflict lock queued and we come here, but
3020                          * REP-ACK not received, so lock was not converted to
3021                          * COS mode yet.
3022                          * Fortunately this window is quite small, so the
3023                          * extra commit should be rare (not to say distributed
3024                          * operation is rare too).
3025                          */
3026                         commit_async = true;
3027                 }
3028         } else if (lock->l_req_mode == LCK_COS) {
3029                 commit_async = true;
3030         }
3031
3032 skip_cos_checks:
3033         rc = ldlm_blocking_ast_nocheck(lock);
3034
3035         if (commit_async) {
3036                 struct lu_env env;
3037
3038                 rc = lu_env_init(&env, LCT_LOCAL);
3039                 if (unlikely(rc != 0))
3040                         CWARN("%s: lu_env initialization failed, cannot "
3041                               "start asynchronous commit: rc = %d\n",
3042                               obd->obd_name, rc);
3043                 else
3044                         mdt_device_commit_async(&env, mdt);
3045                 lu_env_fini(&env);
3046         }
3047         RETURN(rc);
3048 }
3049
3050 /*
3051  * Blocking AST for cross-MDT lock
3052  *
3053  * Discard lock from uncommitted_slc_locks and cancel it.
3054  *
3055  * \param lock  the lock which blocks a request or cancelling lock
3056  * \param desc  unused
3057  * \param data  unused
3058  * \param flag  indicates whether this cancelling or blocking callback
3059  * \retval      0 on success
3060  * \retval      negative number on error
3061  */
3062 int mdt_remote_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
3063                             void *data, int flag)
3064 {
3065         int rc = 0;
3066         ENTRY;
3067
3068         switch (flag) {
3069         case LDLM_CB_BLOCKING: {
3070                 struct lustre_handle lockh;
3071
3072                 ldlm_lock2handle(lock, &lockh);
3073                 rc = ldlm_cli_cancel(&lockh,
3074                         ldlm_is_atomic_cb(lock) ? 0 : LCF_ASYNC);
3075                 if (rc < 0) {
3076                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
3077                         RETURN(rc);
3078                 }
3079                 break;
3080         }
3081         case LDLM_CB_CANCELING: {
3082                 struct obd_device *obd = ldlm_lock_to_ns(lock)->ns_obd;
3083                 struct mdt_device *mdt =
3084                                 mdt_dev(obd->obd_lu_dev->ld_site->ls_top_dev);
3085
3086                 LDLM_DEBUG(lock, "Revoke remote lock\n");
3087
3088                 /* discard slc lock here so that it can be cleaned anytime,
3089                  * especially for cleanup_resource() */
3090                 tgt_discard_slc_lock(&mdt->mdt_lut, lock);
3091
3092                 /* once we cache lock, l_ast_data is set to mdt_object */
3093                 if (lock->l_ast_data != NULL) {
3094                         struct mdt_object *mo = lock->l_ast_data;
3095                         struct lu_env env;
3096
3097                         rc = lu_env_init(&env, LCT_MD_THREAD);
3098                         if (unlikely(rc != 0)) {
3099                                 CWARN("%s: lu_env initialization failed, object"
3100                                       "%p "DFID" is leaked!\n",
3101                                       obd->obd_name, mo,
3102                                       PFID(mdt_object_fid(mo)));
3103                                 RETURN(rc);
3104                         }
3105
3106                         if (lock->l_policy_data.l_inodebits.bits &
3107                             (MDS_INODELOCK_XATTR | MDS_INODELOCK_UPDATE)) {
3108                                 rc = mo_invalidate(&env, mdt_object_child(mo));
3109                                 mo->mot_cache_attr = 0;
3110                         }
3111                         mdt_object_put(&env, mo);
3112                         lu_env_fini(&env);
3113                 }
3114                 break;
3115         }
3116         default:
3117                 LBUG();
3118         }
3119
3120         RETURN(rc);
3121 }
3122
3123 int mdt_check_resent_lock(struct mdt_thread_info *info,
3124                           struct mdt_object *mo,
3125                           struct mdt_lock_handle *lhc)
3126 {
3127         /* the lock might already be gotten in ldlm_handle_enqueue() */
3128         if (unlikely(lustre_handle_is_used(&lhc->mlh_reg_lh))) {
3129                 struct ptlrpc_request *req = mdt_info_req(info);
3130                 struct ldlm_lock      *lock;
3131
3132                 lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
3133                 LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
3134                 if (lock == NULL) {
3135                         /* Lock is pinned by ldlm_handle_enqueue0() as it is
3136                          * a resend case, however, it could be already destroyed
3137                          * due to client eviction or a raced cancel RPC. */
3138                         LDLM_DEBUG_NOLOCK("Invalid lock handle %#llx",
3139                                           lhc->mlh_reg_lh.cookie);
3140                         RETURN(-ESTALE);
3141                 }
3142
3143                 if (!fid_res_name_eq(mdt_object_fid(mo),
3144                                      &lock->l_resource->lr_name)) {
3145                         CWARN("%s: Although resent, but still not "
3146                               "get child lock:"DFID"\n",
3147                               info->mti_exp->exp_obd->obd_name,
3148                               PFID(mdt_object_fid(mo)));
3149                         LDLM_LOCK_PUT(lock);
3150                         RETURN(-EPROTO);
3151                 }
3152                 LDLM_LOCK_PUT(lock);
3153                 return 0;
3154         }
3155         return 1;
3156 }
3157
3158 static void mdt_remote_object_lock_created_cb(struct ldlm_lock *lock)
3159 {
3160         mdt_object_get(NULL, lock->l_ast_data);
3161 }
3162
3163 int mdt_remote_object_lock_try(struct mdt_thread_info *mti,
3164                                struct mdt_object *o, const struct lu_fid *fid,
3165                                struct lustre_handle *lh, enum ldlm_mode mode,
3166                                __u64 *ibits, __u64 trybits, bool cache)
3167 {
3168         struct ldlm_enqueue_info *einfo = &mti->mti_remote_einfo;
3169         union ldlm_policy_data *policy = &mti->mti_policy;
3170         struct ldlm_res_id *res_id = &mti->mti_res_id;
3171         int rc = 0;
3172         ENTRY;
3173
3174         LASSERT(mdt_object_remote(o));
3175
3176         fid_build_reg_res_name(fid, res_id);
3177
3178         memset(einfo, 0, sizeof(*einfo));
3179         einfo->ei_type = LDLM_IBITS;
3180         einfo->ei_mode = mode;
3181         einfo->ei_cb_bl = mdt_remote_blocking_ast;
3182         einfo->ei_cb_cp = ldlm_completion_ast;
3183         einfo->ei_enq_slave = 0;
3184         einfo->ei_res_id = res_id;
3185
3186         if (cache) {
3187                 /*
3188                  * if we cache lock, couple lock with mdt_object, so that object
3189                  * can be easily found in lock ASTs.
3190                  */
3191                 einfo->ei_cbdata = o;
3192                 einfo->ei_cb_created = mdt_remote_object_lock_created_cb;
3193         }
3194
3195         memset(policy, 0, sizeof(*policy));
3196         policy->l_inodebits.bits = *ibits;
3197         policy->l_inodebits.try_bits = trybits;
3198
3199         rc = mo_object_lock(mti->mti_env, mdt_object_child(o), lh, einfo,
3200                             policy);
3201
3202         /* Return successfully acquired bits to a caller */
3203         if (rc == 0) {
3204                 struct ldlm_lock *lock = ldlm_handle2lock(lh);
3205
3206                 LASSERT(lock);
3207                 *ibits = lock->l_policy_data.l_inodebits.bits;
3208                 LDLM_LOCK_PUT(lock);
3209         }
3210         RETURN(rc);
3211 }
3212
3213 int mdt_remote_object_lock(struct mdt_thread_info *mti, struct mdt_object *o,
3214                            const struct lu_fid *fid, struct lustre_handle *lh,
3215                            enum ldlm_mode mode, __u64 ibits, bool cache)
3216 {
3217         return mdt_remote_object_lock_try(mti, o, fid, lh, mode, &ibits, 0,
3218                                           cache);
3219 }
3220
3221 int mdt_object_local_lock(struct mdt_thread_info *info, struct mdt_object *o,
3222                           struct mdt_lock_handle *lh, __u64 *ibits,
3223                           __u64 trybits, bool cos_incompat)
3224 {
3225         struct ldlm_namespace *ns = info->mti_mdt->mdt_namespace;
3226         union ldlm_policy_data *policy = &info->mti_policy;
3227         struct ldlm_res_id *res_id = &info->mti_res_id;
3228         __u64 dlmflags = 0, *cookie = NULL;
3229         int rc;
3230         ENTRY;
3231
3232         LASSERT(!lustre_handle_is_used(&lh->mlh_reg_lh));
3233         LASSERT(!lustre_handle_is_used(&lh->mlh_pdo_lh));
3234         LASSERT(lh->mlh_reg_mode != LCK_MINMODE);
3235         LASSERT(lh->mlh_type != MDT_NUL_LOCK);
3236
3237         if (cos_incompat) {
3238                 LASSERT(lh->mlh_reg_mode == LCK_PW ||
3239                         lh->mlh_reg_mode == LCK_EX);
3240                 dlmflags |= LDLM_FL_COS_INCOMPAT;
3241         } else if (mdt_cos_is_enabled(info->mti_mdt)) {
3242                 dlmflags |= LDLM_FL_COS_ENABLED;
3243         }
3244
3245         /* Only enqueue LOOKUP lock for remote object */
3246         LASSERT(ergo(mdt_object_remote(o), *ibits == MDS_INODELOCK_LOOKUP));
3247
3248         if (lh->mlh_type == MDT_PDO_LOCK) {
3249                 /* check for exists after object is locked */
3250                 if (mdt_object_exists(o) == 0) {
3251                         /* Non-existent object shouldn't have PDO lock */
3252                         RETURN(-ESTALE);
3253                 } else {
3254                         /* Non-dir object shouldn't have PDO lock */
3255                         if (!S_ISDIR(lu_object_attr(&o->mot_obj)))
3256                                 RETURN(-ENOTDIR);
3257                 }
3258         }
3259
3260         fid_build_reg_res_name(mdt_object_fid(o), res_id);
3261         dlmflags |= LDLM_FL_ATOMIC_CB;
3262
3263         if (info->mti_exp)
3264                 cookie = &info->mti_exp->exp_handle.h_cookie;
3265
3266         /*
3267          * Take PDO lock on whole directory and build correct @res_id for lock
3268          * on part of directory.
3269          */
3270         if (lh->mlh_pdo_hash != 0) {
3271                 LASSERT(lh->mlh_type == MDT_PDO_LOCK);
3272                 mdt_lock_pdo_mode(info, o, lh);
3273                 if (lh->mlh_pdo_mode != LCK_NL) {
3274                         /*
3275                          * Do not use LDLM_FL_LOCAL_ONLY for parallel lock, it
3276                          * is never going to be sent to client and we do not
3277                          * want it slowed down due to possible cancels.
3278                          */
3279                         policy->l_inodebits.bits =
3280                                 *ibits & MDS_INODELOCK_UPDATE;
3281                         policy->l_inodebits.try_bits =
3282                                 trybits & MDS_INODELOCK_UPDATE;
3283                         /* at least one of them should be set */
3284                         LASSERT(policy->l_inodebits.bits |
3285                                 policy->l_inodebits.try_bits);
3286                         rc = mdt_fid_lock(info->mti_env, ns, &lh->mlh_pdo_lh,
3287                                           lh->mlh_pdo_mode, policy, res_id,
3288                                           dlmflags, cookie);
3289                         if (unlikely(rc != 0))
3290                                 GOTO(out_unlock, rc);
3291                 }
3292
3293                 /*
3294                  * Finish res_id initializing by name hash marking part of
3295                  * directory which is taking modification.
3296                  */
3297                 res_id->name[LUSTRE_RES_ID_HSH_OFF] = lh->mlh_pdo_hash;
3298         }
3299
3300         policy->l_inodebits.bits = *ibits;
3301         policy->l_inodebits.try_bits = trybits;
3302
3303         /*
3304          * Use LDLM_FL_LOCAL_ONLY for this lock. We do not know yet if it is
3305          * going to be sent to client. If it is - mdt_intent_policy() path will
3306          * fix it up and turn FL_LOCAL flag off.
3307          */
3308         rc = mdt_fid_lock(info->mti_env, ns, &lh->mlh_reg_lh, lh->mlh_reg_mode,
3309                           policy, res_id, LDLM_FL_LOCAL_ONLY | dlmflags,
3310                           cookie);
3311 out_unlock:
3312         if (rc != 0)
3313                 mdt_object_unlock(info, o, lh, 1);
3314         else if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_MDS_PDO_LOCK)) &&
3315                    lh->mlh_pdo_hash != 0 &&
3316                    (lh->mlh_reg_mode == LCK_PW || lh->mlh_reg_mode == LCK_EX))
3317                 OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_PDO_LOCK, 15);
3318
3319         /* Return successfully acquired bits to a caller */
3320         if (rc == 0) {
3321                 struct ldlm_lock *lock = ldlm_handle2lock(&lh->mlh_reg_lh);
3322
3323                 LASSERT(lock);
3324                 *ibits = lock->l_policy_data.l_inodebits.bits;
3325                 LDLM_LOCK_PUT(lock);
3326         }
3327         RETURN(rc);
3328 }
3329
3330 static int
3331 mdt_object_lock_internal(struct mdt_thread_info *info, struct mdt_object *o,
3332                          struct mdt_lock_handle *lh, __u64 *ibits,
3333                          __u64 trybits, bool cos_incompat)
3334 {
3335         struct mdt_lock_handle *local_lh = NULL;
3336         int rc;
3337         ENTRY;
3338
3339         if (!mdt_object_remote(o)) {
3340                 rc = mdt_object_local_lock(info, o, lh, ibits, trybits,
3341                                            cos_incompat);
3342                 RETURN(rc);
3343         }
3344
3345         /* XXX do not support PERM/LAYOUT/XATTR lock for remote object yet */
3346         *ibits &= ~(MDS_INODELOCK_PERM | MDS_INODELOCK_LAYOUT |
3347                     MDS_INODELOCK_XATTR);
3348
3349         /* Only enqueue LOOKUP lock for remote object */
3350         if (*ibits & MDS_INODELOCK_LOOKUP) {
3351                 __u64 local = MDS_INODELOCK_LOOKUP;
3352
3353                 rc = mdt_object_local_lock(info, o, lh, &local, 0,
3354                                            cos_incompat);
3355                 if (rc != ELDLM_OK)
3356                         RETURN(rc);
3357
3358                 local_lh = lh;
3359         }
3360
3361         if ((*ibits | trybits) & MDS_INODELOCK_UPDATE) {
3362                 /* Sigh, PDO needs to enqueue 2 locks right now, but
3363                  * enqueue RPC can only request 1 lock, to avoid extra
3364                  * RPC, so it will instead enqueue EX lock for remote
3365                  * object anyway XXX*/
3366                 if (lh->mlh_type == MDT_PDO_LOCK &&
3367                     lh->mlh_pdo_hash != 0) {
3368                         CDEBUG(D_INFO, "%s: "DFID" convert PDO lock to"
3369                                "EX lock.\n", mdt_obd_name(info->mti_mdt),
3370                                PFID(mdt_object_fid(o)));
3371                         lh->mlh_pdo_hash = 0;
3372                         lh->mlh_rreg_mode = LCK_EX;
3373                         lh->mlh_type = MDT_REG_LOCK;
3374                 }
3375
3376                 rc = mdt_remote_object_lock_try(info, o, mdt_object_fid(o),
3377                                                 &lh->mlh_rreg_lh,
3378                                                 lh->mlh_rreg_mode,
3379                                                 ibits, trybits, false);
3380                 if (rc != ELDLM_OK) {
3381                         if (local_lh != NULL)
3382                                 mdt_object_unlock(info, o, local_lh, rc);
3383                         RETURN(rc);
3384                 }
3385         }
3386
3387         /* other components like LFSCK can use lockless access
3388          * and populate cache, so we better invalidate it */
3389         mo_invalidate(info->mti_env, mdt_object_child(o));
3390
3391         RETURN(0);
3392 }
3393
3394 int mdt_object_lock(struct mdt_thread_info *info, struct mdt_object *o,
3395                     struct mdt_lock_handle *lh, __u64 ibits)
3396 {
3397         return mdt_object_lock_internal(info, o, lh, &ibits, 0, false);
3398 }
3399
3400 int mdt_reint_object_lock(struct mdt_thread_info *info, struct mdt_object *o,
3401                           struct mdt_lock_handle *lh, __u64 ibits,
3402                           bool cos_incompat)
3403 {
3404         LASSERT(lh->mlh_reg_mode == LCK_PW || lh->mlh_reg_mode == LCK_EX);
3405         return mdt_object_lock_internal(info, o, lh, &ibits, 0,
3406                                         cos_incompat);
3407 }
3408
3409 int mdt_object_lock_try(struct mdt_thread_info *info, struct mdt_object *o,
3410                         struct mdt_lock_handle *lh, __u64 *ibits,
3411                         __u64 trybits, bool cos_incompat)
3412 {
3413         bool trylock_only = *ibits == 0;
3414         int rc;
3415
3416         LASSERT(!(*ibits & trybits));
3417         rc = mdt_object_lock_internal(info, o, lh, ibits, trybits,
3418                                       cos_incompat);
3419         if (rc && trylock_only) { /* clear error for try ibits lock only */
3420                 LASSERT(*ibits == 0);
3421                 rc = 0;
3422         }
3423         return rc;
3424 }
3425
3426 /**
3427  * Save a lock within request object.
3428  *
3429  * Keep the lock referenced until whether client ACK or transaction
3430  * commit happens or release the lock immediately depending on input
3431  * parameters. If COS is ON, a write lock is converted to COS lock
3432  * before saving.
3433  *
3434  * \param info thead info object
3435  * \param h lock handle
3436  * \param mode lock mode
3437  * \param decref force immediate lock releasing
3438  */
3439 void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h,
3440                    enum ldlm_mode mode, int decref)
3441 {
3442         ENTRY;
3443
3444         if (lustre_handle_is_used(h)) {
3445                 if (decref || !info->mti_has_trans ||
3446                     !(mode & (LCK_PW | LCK_EX))) {
3447                         mdt_fid_unlock(h, mode);
3448                 } else {
3449                         struct mdt_device *mdt = info->mti_mdt;
3450                         struct ldlm_lock *lock = ldlm_handle2lock(h);
3451                         struct ptlrpc_request *req = mdt_info_req(info);
3452                         bool cos = mdt_cos_is_enabled(mdt);
3453                         bool convert_lock = !cos && mdt_slc_is_enabled(mdt);
3454
3455                         LASSERTF(lock != NULL, "no lock for cookie %#llx\n",
3456                                  h->cookie);
3457
3458                         /* there is no request if mdt_object_unlock() is called
3459                          * from mdt_export_cleanup()->mdt_add_dirty_flag() */
3460                         if (likely(req != NULL)) {
3461                                 LDLM_DEBUG(lock, "save lock request %p reply "
3462                                         "state %p transno %lld\n", req,
3463                                         req->rq_reply_state, req->rq_transno);
3464                                 if (cos) {
3465                                         ldlm_lock_mode_downgrade(lock, LCK_COS);
3466                                         mode = LCK_COS;
3467                                 }
3468                                 if (req->rq_export->exp_disconnected)
3469                                         mdt_fid_unlock(h, mode);
3470                                 else
3471                                         ptlrpc_save_lock(req, h, mode, cos,
3472                                                          convert_lock);
3473                         } else {
3474                                 mdt_fid_unlock(h, mode);
3475                         }
3476                         if (mdt_is_lock_sync(lock)) {
3477                                 CDEBUG(D_HA, "found sync-lock,"
3478                                        " async commit started\n");
3479                                 mdt_device_commit_async(info->mti_env,
3480                                                         mdt);
3481                         }
3482                         LDLM_LOCK_PUT(lock);
3483                 }
3484                 h->cookie = 0ull;
3485         }
3486
3487         EXIT;
3488 }
3489
3490 /**
3491  * Save cross-MDT lock in uncommitted_slc_locks
3492  *
3493  * Keep the lock referenced until transaction commit happens or release the lock
3494  * immediately depending on input parameters.
3495  *
3496  * \param info thead info object
3497  * \param h lock handle
3498  * \param mode lock mode
3499  * \param decref force immediate lock releasing
3500  */
3501 static void mdt_save_remote_lock(struct mdt_thread_info *info,
3502                                  struct mdt_object *o, struct lustre_handle *h,
3503                                  enum ldlm_mode mode, int decref)
3504 {
3505         ENTRY;
3506
3507         if (lustre_handle_is_used(h)) {
3508                 struct ldlm_lock *lock = ldlm_handle2lock(h);
3509
3510                 if (o != NULL &&
3511                     (lock->l_policy_data.l_inodebits.bits &
3512                      (MDS_INODELOCK_XATTR | MDS_INODELOCK_UPDATE)))
3513                         mo_invalidate(info->mti_env, mdt_object_child(o));
3514
3515                 if (decref || !info->mti_has_trans ||
3516                     !(mode & (LCK_PW | LCK_EX))) {
3517                         ldlm_lock_decref_and_cancel(h, mode);
3518                         LDLM_LOCK_PUT(lock);
3519                 } else {
3520                         struct ptlrpc_request *req = mdt_info_req(info);
3521
3522                         LASSERT(req != NULL);
3523                         tgt_save_slc_lock(&info->mti_mdt->mdt_lut, lock,
3524                                           req->rq_transno);
3525                         ldlm_lock_decref(h, mode);
3526                 }
3527                 h->cookie = 0ull;
3528         }
3529
3530         EXIT;
3531 }
3532
3533 /**
3534  * Unlock mdt object.
3535  *
3536  * Immeditely release the regular lock and the PDO lock or save the
3537  * lock in request and keep them referenced until client ACK or
3538  * transaction commit.
3539  *
3540  * \param info thread info object
3541  * \param o mdt object
3542  * \param lh mdt lock handle referencing regular and PDO locks
3543  * \param decref force immediate lock releasing
3544  *
3545  * XXX o is not used and may be NULL, see hsm_cdt_request_completed().
3546  */
3547 void mdt_object_unlock(struct mdt_thread_info *info, struct mdt_object *o,
3548                        struct mdt_lock_handle *lh, int decref)
3549 {
3550         ENTRY;
3551
3552         mdt_save_lock(info, &lh->mlh_pdo_lh, lh->mlh_pdo_mode, decref);
3553         mdt_save_lock(info, &lh->mlh_reg_lh, lh->mlh_reg_mode, decref);
3554         mdt_save_remote_lock(info, o, &lh->mlh_rreg_lh, lh->mlh_rreg_mode,
3555                              decref);
3556
3557         EXIT;
3558 }
3559
3560 struct mdt_object *mdt_object_find_lock(struct mdt_thread_info *info,
3561                                         const struct lu_fid *f,
3562                                         struct mdt_lock_handle *lh,
3563                                         __u64 ibits)
3564 {
3565         struct mdt_object *o;
3566
3567         o = mdt_object_find(info->mti_env, info->mti_mdt, f);
3568         if (!IS_ERR(o)) {
3569                 int rc;
3570
3571                 rc = mdt_object_lock(info, o, lh, ibits);
3572                 if (rc != 0) {
3573                         mdt_object_put(info->mti_env, o);
3574                         o = ERR_PTR(rc);
3575                 }
3576         }
3577         return o;
3578 }
3579
3580 void mdt_object_unlock_put(struct mdt_thread_info * info,
3581                            struct mdt_object * o,
3582                            struct mdt_lock_handle *lh,
3583                            int decref)
3584 {
3585         mdt_object_unlock(info, o, lh, decref);
3586         mdt_object_put(info->mti_env, o);
3587 }
3588
3589 /*
3590  * Generic code handling requests that have struct mdt_body passed in:
3591  *
3592  *  - extract mdt_body from request and save it in @info, if present;
3593  *
3594  *  - create lu_object, corresponding to the fid in mdt_body, and save it in
3595  *  @info;
3596  *
3597  *  - if HABEO_CORPUS flag is set for this request type check whether object
3598  *  actually exists on storage (lu_object_exists()).
3599  *
3600  */
3601 static int mdt_body_unpack(struct mdt_thread_info *info,
3602                            enum tgt_handler_flags flags)
3603 {
3604         const struct mdt_body    *body;
3605         struct mdt_object        *obj;
3606         const struct lu_env      *env;
3607         struct req_capsule       *pill;
3608         int                       rc;
3609         ENTRY;
3610
3611         env = info->mti_env;
3612         pill = info->mti_pill;
3613
3614         body = info->mti_body = req_capsule_client_get(pill, &RMF_MDT_BODY);
3615         if (body == NULL)
3616                 RETURN(-EFAULT);
3617
3618         if (!(body->mbo_valid & OBD_MD_FLID))
3619                 RETURN(0);
3620
3621         if (!fid_is_sane(&body->mbo_fid1)) {
3622                 CERROR("Invalid fid: "DFID"\n", PFID(&body->mbo_fid1));
3623                 RETURN(-EINVAL);
3624         }
3625
3626         obj = mdt_object_find(env, info->mti_mdt, &body->mbo_fid1);
3627         if (!IS_ERR(obj)) {
3628                 if ((flags & HABEO_CORPUS) && !mdt_object_exists(obj)) {
3629                         mdt_object_put(env, obj);
3630                         rc = -ENOENT;
3631                 } else {
3632                         info->mti_object = obj;
3633                         rc = 0;
3634                 }
3635         } else
3636                 rc = PTR_ERR(obj);
3637
3638         RETURN(rc);
3639 }
3640
3641 static int mdt_unpack_req_pack_rep(struct mdt_thread_info *info,
3642                                    enum tgt_handler_flags flags)
3643 {
3644         struct req_capsule *pill = info->mti_pill;
3645         int rc;
3646         ENTRY;
3647
3648         if (req_capsule_has_field(pill, &RMF_MDT_BODY, RCL_CLIENT))
3649                 rc = mdt_body_unpack(info, flags);
3650         else
3651                 rc = 0;
3652
3653         if (rc == 0 && (flags & HABEO_REFERO)) {
3654                 /* Pack reply. */
3655                 if (req_capsule_has_field(pill, &RMF_MDT_MD, RCL_SERVER))
3656                         req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER,
3657                                              DEF_REP_MD_SIZE);
3658                 if (req_capsule_has_field(pill, &RMF_LOGCOOKIES, RCL_SERVER))
3659                         req_capsule_set_size(pill, &RMF_LOGCOOKIES,
3660                                              RCL_SERVER, 0);
3661
3662                 /* Set ACL reply buffer size as LUSTRE_POSIX_ACL_MAX_SIZE_OLD
3663                  * by default. If the target object has more ACL entries, then
3664                  * enlarge the buffer when necessary. */
3665                 if (req_capsule_has_field(pill, &RMF_ACL, RCL_SERVER))
3666                         req_capsule_set_size(pill, &RMF_ACL, RCL_SERVER,
3667                                              LUSTRE_POSIX_ACL_MAX_SIZE_OLD);
3668
3669                 mdt_preset_secctx_size(info);
3670
3671                 rc = req_capsule_server_pack(pill);
3672                 if (rc)
3673                         CWARN("%s: cannot pack response: rc = %d\n",
3674                                       mdt_obd_name(info->mti_mdt), rc);
3675         }
3676         RETURN(rc);
3677 }
3678
3679 void mdt_lock_handle_init(struct mdt_lock_handle *lh)
3680 {
3681         lh->mlh_type = MDT_NUL_LOCK;
3682         lh->mlh_reg_lh.cookie = 0ull;
3683         lh->mlh_reg_mode = LCK_MINMODE;
3684         lh->mlh_pdo_lh.cookie = 0ull;
3685         lh->mlh_pdo_mode = LCK_MINMODE;
3686         lh->mlh_rreg_lh.cookie = 0ull;
3687         lh->mlh_rreg_mode = LCK_MINMODE;
3688 }
3689
3690 void mdt_lock_handle_fini(struct mdt_lock_handle *lh)
3691 {
3692         LASSERT(!lustre_handle_is_used(&lh->mlh_reg_lh));
3693         LASSERT(!lustre_handle_is_used(&lh->mlh_pdo_lh));
3694 }
3695
3696 /*
3697  * Initialize fields of struct mdt_thread_info. Other fields are left in
3698  * uninitialized state, because it's too expensive to zero out whole
3699  * mdt_thread_info (> 1K) on each request arrival.
3700  */
3701 void mdt_thread_info_init(struct ptlrpc_request *req,
3702                           struct mdt_thread_info *info)
3703 {
3704         int i;
3705
3706         info->mti_pill = &req->rq_pill;
3707
3708         /* lock handle */
3709         for (i = 0; i < ARRAY_SIZE(info->mti_lh); i++)
3710                 mdt_lock_handle_init(&info->mti_lh[i]);
3711
3712         /* mdt device: it can be NULL while CONNECT */
3713         if (req->rq_export) {
3714                 info->mti_mdt = mdt_dev(req->rq_export->exp_obd->obd_lu_dev);
3715                 info->mti_exp = req->rq_export;
3716         } else
3717                 info->mti_mdt = NULL;
3718         info->mti_env = req->rq_svc_thread->t_env;
3719         info->mti_transno = lustre_msg_get_transno(req->rq_reqmsg);
3720
3721         memset(&info->mti_attr, 0, sizeof(info->mti_attr));
3722         info->mti_big_buf = LU_BUF_NULL;
3723         info->mti_body = NULL;
3724         info->mti_object = NULL;
3725         info->mti_dlm_req = NULL;
3726         info->mti_has_trans = 0;
3727         info->mti_cross_ref = 0;
3728         info->mti_opdata = 0;
3729         info->mti_big_lmm_used = 0;
3730         info->mti_big_acl_used = 0;
3731         info->mti_som_valid = 0;
3732
3733         info->mti_spec.no_create = 0;
3734         info->mti_spec.sp_rm_entry = 0;
3735         info->mti_spec.sp_permitted = 0;
3736         info->mti_spec.sp_migrate_close = 0;
3737
3738         info->mti_spec.u.sp_ea.eadata = NULL;
3739         info->mti_spec.u.sp_ea.eadatalen = 0;
3740 }
3741
3742 void mdt_thread_info_fini(struct mdt_thread_info *info)
3743 {
3744         int i;
3745
3746         if (info->mti_object != NULL) {
3747                 mdt_object_put(info->mti_env, info->mti_object);
3748                 info->mti_object = NULL;
3749         }
3750
3751         for (i = 0; i < ARRAY_SIZE(info->mti_lh); i++)
3752                 mdt_lock_handle_fini(&info->mti_lh[i]);
3753         info->mti_env = NULL;
3754         info->mti_pill = NULL;
3755         info->mti_exp = NULL;
3756
3757         if (unlikely(info->mti_big_buf.lb_buf != NULL))
3758                 lu_buf_free(&info->mti_big_buf);
3759 }
3760
3761 struct mdt_thread_info *tsi2mdt_info(struct tgt_session_info *tsi)
3762 {
3763         struct mdt_thread_info  *mti;
3764
3765         mti = mdt_th_info(tsi->tsi_env);
3766         LASSERT(mti != NULL);
3767
3768         mdt_thread_info_init(tgt_ses_req(tsi), mti);
3769         if (tsi->tsi_corpus != NULL) {
3770                 mti->mti_object = mdt_obj(tsi->tsi_corpus);
3771                 lu_object_get(tsi->tsi_corpus);
3772         }
3773         mti->mti_body = tsi->tsi_mdt_body;
3774         mti->mti_dlm_req = tsi->tsi_dlm_req;
3775
3776         return mti;
3777 }
3778
3779 static int mdt_tgt_connect(struct tgt_session_info *tsi)
3780 {
3781         if (OBD_FAIL_CHECK(OBD_FAIL_TGT_DELAY_CONDITIONAL) &&
3782             cfs_fail_val ==
3783             tsi2mdt_info(tsi)->mti_mdt->mdt_seq_site.ss_node_id) {
3784                 set_current_state(TASK_UNINTERRUPTIBLE);
3785                 schedule_timeout(msecs_to_jiffies(3 * MSEC_PER_SEC));
3786         }
3787
3788         return tgt_connect(tsi);
3789 }
3790
3791 static int mdt_intent_glimpse(enum ldlm_intent_flags it_opc,
3792                               struct mdt_thread_info *info,
3793                               struct ldlm_lock **lockp, __u64 flags)
3794 {
3795         return mdt_glimpse_enqueue(info, info->mti_mdt->mdt_namespace,
3796                                    lockp, flags);
3797 }
3798 static int mdt_intent_brw(enum ldlm_intent_flags it_opc,
3799                           struct mdt_thread_info *info,
3800                           struct ldlm_lock **lockp, __u64 flags)
3801 {
3802         return mdt_brw_enqueue(info, info->mti_mdt->mdt_namespace,
3803                                lockp, flags);
3804 }
3805
/*
 * Replace the lock in *@lockp with the lock referenced by @lh->mlh_reg_lh
 * and prepare that lock to be handed to the client of the current request.
 *
 * \param[in] info      thread info, used to find the request being served
 * \param[in,out] lockp in: lock created for this enqueue; out: the
 *                      replacement lock (only when a lock was found)
 * \param[in,out] lh    lock handle carrying the cookie of the replacement
 *                      lock; the cookie is zeroed on every return path
 * \param[in] flags     LDLM_FL_* flags of the enqueue
 * \param[in] result    only reported in the assertion message below
 *
 * \retval 0                    no lock behind @lh and LDLM_FL_INTENT_ONLY set
 * \retval -ESTALE              no lock behind @lh and LDLM_FL_RESENT set
 * \retval ELDLM_LOCK_REPLACED  *@lockp now points to the replacement lock
 */
int mdt_intent_lock_replace(struct mdt_thread_info *info,
                            struct ldlm_lock **lockp,
                            struct mdt_lock_handle *lh,
                            __u64 flags, int result)
{
        struct ptlrpc_request  *req = mdt_info_req(info);
        struct ldlm_lock       *lock = *lockp;
        struct ldlm_lock       *new_lock;

        /* If possible resent found a lock, @lh is set to its handle */
        new_lock = ldlm_handle2lock_long(&lh->mlh_reg_lh, 0);

        /* Intent-only enqueue: no lock to hand over, nothing to replace. */
        if (new_lock == NULL && (flags & LDLM_FL_INTENT_ONLY)) {
                lh->mlh_reg_lh.cookie = 0;
                RETURN(0);
        }

        if (new_lock == NULL && (flags & LDLM_FL_RESENT)) {
                /* Lock is pinned by ldlm_handle_enqueue0() as it is
                 * a resend case, however, it could be already destroyed
                 * due to client eviction or a raced cancel RPC. */
                LDLM_DEBUG_NOLOCK("Invalid lock handle %#llx\n",
                                  lh->mlh_reg_lh.cookie);
                lh->mlh_reg_lh.cookie = 0;
                RETURN(-ESTALE);
        }

        /* Any other case without a lock behind @lh is treated as a bug. */
        LASSERTF(new_lock != NULL,
                 "lockh %#llx flags %#llx : rc = %d\n",
                 lh->mlh_reg_lh.cookie, flags, result);

        /*
         * If we've already given this lock to a client once, then we should
         * have no readers or writers.  Otherwise, we should have one reader
         * _or_ writer ref (which will be zeroed below) before returning the
         * lock to a client.
         */
        if (new_lock->l_export == req->rq_export) {
                LASSERT(new_lock->l_readers + new_lock->l_writers == 0);
        } else {
                LASSERT(new_lock->l_export == NULL);
                LASSERT(new_lock->l_readers + new_lock->l_writers == 1);
        }

        *lockp = new_lock;

        if (new_lock->l_export == req->rq_export) {
                /*
                 * Already gave this to the client, which means that we
                 * reconstructed a reply.
                 */
                LASSERT(lustre_msg_get_flags(req->rq_reqmsg) &
                        MSG_RESENT);

                /* Drop the reference taken by the handle lookup above. */
                LDLM_LOCK_RELEASE(new_lock);
                lh->mlh_reg_lh.cookie = 0;
                RETURN(ELDLM_LOCK_REPLACED);
        }

        /*
         * Fixup the lock to be given to the client.
         */
        lock_res_and_lock(new_lock);
        /* Zero new_lock->l_readers and new_lock->l_writers without triggering
         * possible blocking AST. */
        while (new_lock->l_readers > 0) {
                lu_ref_del(&new_lock->l_reference, "reader", new_lock);
                lu_ref_del(&new_lock->l_reference, "user", new_lock);
                new_lock->l_readers--;
        }
        while (new_lock->l_writers > 0) {
                lu_ref_del(&new_lock->l_reference, "writer", new_lock);
                lu_ref_del(&new_lock->l_reference, "user", new_lock);
                new_lock->l_writers--;
        }

        /*
         * Attach the lock to the requesting export and copy the ASTs and
         * the remote handle over from the lock it replaces.
         */
        new_lock->l_export = class_export_lock_get(req->rq_export, new_lock);
        new_lock->l_blocking_ast = lock->l_blocking_ast;
        new_lock->l_completion_ast = lock->l_completion_ast;
        if (ldlm_has_dom(new_lock))
                new_lock->l_glimpse_ast = ldlm_server_glimpse_ast;
        new_lock->l_remote_handle = lock->l_remote_handle;
        new_lock->l_flags &= ~LDLM_FL_LOCAL;

        unlock_res_and_lock(new_lock);

        /* Index the lock by its remote handle in the export's lock hash. */
        cfs_hash_add(new_lock->l_export->exp_lock_hash,
                     &new_lock->l_remote_handle,
                     &new_lock->l_exp_hash);

        /* Drop the reference taken by the handle lookup above. */
        LDLM_LOCK_RELEASE(new_lock);
        lh->mlh_reg_lh.cookie = 0;

        RETURN(ELDLM_LOCK_REPLACED);
}
3901
3902 void mdt_intent_fixup_resent(struct mdt_thread_info *info,
3903                              struct ldlm_lock *new_lock,
3904                              struct mdt_lock_handle *lh, __u64 flags)
3905 {
3906         struct ptlrpc_request  *req = mdt_info_req(info);
3907         struct ldlm_request    *dlmreq;
3908
3909         if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
3910                 return;
3911
3912         dlmreq = req_capsule_client_get(info->mti_pill, &RMF_DLM_REQ);
3913
3914         /* Check if this is a resend case (MSG_RESENT is set on RPC) and a
3915          * lock was found by ldlm_handle_enqueue(); if so @lh must be
3916          * initialized. */
3917         if (flags & LDLM_FL_RESENT) {
3918                 lh->mlh_reg_lh.cookie = new_lock->l_handle.h_cookie;
3919                 lh->mlh_reg_mode = new_lock->l_granted_mode;
3920
3921                 LDLM_DEBUG(new_lock, "Restoring lock cookie");
3922                 DEBUG_REQ(D_DLMTRACE, req, "restoring lock cookie %#llx",
3923                           lh->mlh_reg_lh.cookie);
3924                 return;
3925         }
3926
3927         /*
3928          * If the xid matches, then we know this is a resent request, and allow
3929          * it. (It's probably an OPEN, for which we don't send a lock.
3930          */
3931         if (req_can_reconstruct(req, NULL))
3932                 return;
3933
3934         /*
3935          * This remote handle isn't enqueued, so we never received or processed
3936          * this request.  Clear MSG_RESENT, because it can be handled like any
3937          * normal request now.
3938          */
3939         lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
3940
3941         DEBUG_REQ(D_DLMTRACE, req, "no existing lock with rhandle %#llx",
3942                   dlmreq->lock_handle[0].cookie);
3943 }
3944
3945 static int mdt_intent_getxattr(enum ldlm_intent_flags it_opc,
3946                                struct mdt_thread_info *info,
3947                                struct ldlm_lock **lockp,
3948                                __u64 flags)
3949 {
3950         struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_RMT];
3951         struct ldlm_reply      *ldlm_rep = NULL;
3952         int rc;
3953         ENTRY;
3954
3955         /*
3956          * Initialize lhc->mlh_reg_lh either from a previously granted lock
3957          * (for the resend case) or a new lock. Below we will use it to
3958          * replace the original lock.
3959          */
3960         mdt_intent_fixup_resent(info, *lockp, lhc, flags);
3961         if (!lustre_handle_is_used(&lhc->mlh_reg_lh)) {
3962                 mdt_lock_reg_init(lhc, (*lockp)->l_req_mode);
3963                 rc = mdt_object_lock(info, info->mti_object, lhc,
3964                                      MDS_INODELOCK_XATTR);
3965                 if (rc)
3966                         return rc;
3967         }
3968
3969         rc = mdt_getxattr(info);
3970
3971         if (mdt_info_req(info)->rq_repmsg != NULL)
3972                 ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
3973
3974         if (ldlm_rep == NULL ||
3975             OBD_FAIL_CHECK(OBD_FAIL_MDS_XATTR_REP)) {
3976                 mdt_object_unlock(info,  info->mti_object, lhc, 1);
3977                 if (is_serious(rc))
3978                         RETURN(rc);
3979                 else
3980                         RETURN(err_serious(-EFAULT));
3981         }
3982
3983         ldlm_rep->lock_policy_res2 = clear_serious(rc);
3984
3985         /* This is left for interop instead of adding a new interop flag.
3986          * LU-7433 */
3987 #if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(3, 0, 0, 0)
3988         if (ldlm_rep->lock_policy_res2) {
3989                 mdt_object_unlock(info, info->mti_object, lhc, 1);
3990                 RETURN(ELDLM_LOCK_ABORTED);
3991         }
3992 #endif
3993
3994         rc = mdt_intent_lock_replace(info, lockp, lhc, flags, rc);
3995         RETURN(rc);
3996 }
3997
3998 static int mdt_intent_getattr(enum ldlm_intent_flags it_opc,
3999                               struct mdt_thread_info *info,
4000                               struct ldlm_lock **lockp,
4001                               __u64 flags)
4002 {
4003         struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_RMT];
4004         __u64                   child_bits;
4005         struct ldlm_reply      *ldlm_rep;
4006         struct mdt_body        *reqbody;
4007         struct mdt_body        *repbody;
4008         int                     rc, rc2;
4009         ENTRY;
4010
4011         reqbody = req_capsule_client_get(info->mti_pill, &RMF_MDT_BODY);
4012         LASSERT(reqbody);
4013
4014         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
4015         LASSERT(repbody);
4016
4017         info->mti_cross_ref = !!(reqbody->mbo_valid & OBD_MD_FLCROSSREF);
4018         repbody->mbo_eadatasize = 0;
4019         repbody->mbo_aclsize = 0;
4020
4021         switch (it_opc) {
4022         case IT_LOOKUP:
4023                 child_bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
4024                 break;
4025         case IT_GETATTR:
4026                 child_bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
4027                              MDS_INODELOCK_PERM;
4028                 break;
4029         default:
4030                 CERROR("%s: unsupported intent %#x\n",
4031                        mdt_obd_name(info->mti_mdt), (unsigned int)it_opc);
4032                 GOTO(out_shrink, rc = -EINVAL);
4033         }
4034
4035         rc = mdt_init_ucred_intent_getattr(info, reqbody);
4036         if (rc)
4037                 GOTO(out_shrink, rc);
4038
4039         ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
4040         mdt_set_disposition(info, ldlm_rep, DISP_IT_EXECD);
4041
4042         /* Get lock from request for possible resent case. */
4043         mdt_intent_fixup_resent(info, *lockp, lhc, flags);
4044
4045         rc = mdt_getattr_name_lock(info, lhc, child_bits, ldlm_rep);
4046         ldlm_rep->lock_policy_res2 = clear_serious(rc);
4047
4048         if (mdt_get_disposition(ldlm_rep, DISP_LOOKUP_NEG))
4049                 ldlm_rep->lock_policy_res2 = 0;
4050         if (!mdt_get_disposition(ldlm_rep, DISP_LOOKUP_POS) ||
4051             ldlm_rep->lock_policy_res2) {
4052                 lhc->mlh_reg_lh.cookie = 0ull;
4053                 GOTO(out_ucred, rc = ELDLM_LOCK_ABORTED);
4054         }
4055
4056         rc = mdt_intent_lock_replace(info, lockp, lhc, flags, rc);
4057         EXIT;
4058 out_ucred:
4059         mdt_exit_ucred(info);
4060 out_shrink:
4061         mdt_client_compatibility(info);
4062         rc2 = mdt_fix_reply(info);
4063         if (rc == 0)
4064                 rc = rc2;
4065         return rc;
4066 }
4067
/*
 * Intent handler for IT_LAYOUT.
 *
 * Reads the layout intent from the request, sizes the LVB reply buffer,
 * packs the reply and, for write/truncate intents, calls
 * mdt_layout_change() on the object named by the lock resource.
 *
 * \param[in] it_opc    intent opcode (unused)
 * \param[in] info      thread info of the request
 * \param[in,out] lockp lock created for the enqueue; may be replaced
 * \param[in] flags     LDLM_FL_* flags of the enqueue
 *
 * \retval -EPROTO      the layout intent or the DLM reply buffer is missing
 * \retval -ENOTSUPP    known but unsupported layout intent opcode
 * \retval -EINVAL      unknown layout intent opcode
 * \retval 0            LAYOUT_INTENT_ACCESS: nothing to change here
 * \retval other        error from a helper, status of a reconstructed reply,
 *                      or the result of mdt_layout_change() /
 *                      mdt_intent_lock_replace()
 */
static int mdt_intent_layout(enum ldlm_intent_flags it_opc,
                             struct mdt_thread_info *info,
                             struct ldlm_lock **lockp,
                             __u64 flags)
{
        struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_RMT];
        struct md_layout_change layout = { .mlc_opc = MD_LAYOUT_NOP };
        struct layout_intent *intent;
        struct ldlm_reply *ldlm_rep;
        struct lu_fid *fid = &info->mti_tmp_fid2;
        struct mdt_object *obj = NULL;
        int layout_size = 0;
        struct lu_buf *buf = &layout.mlc_buf;
        int rc = 0;

        ENTRY;

        /* The FID of the target object is encoded in the resource name. */
        fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);

        intent = req_capsule_client_get(info->mti_pill, &RMF_LAYOUT_INTENT);
        if (intent == NULL)
                RETURN(-EPROTO);

        CDEBUG(D_INFO, DFID "got layout change request from client: "
               "opc:%u flags:%#x extent "DEXT"\n",
               PFID(fid), intent->li_opc, intent->li_flags,
               PEXT(&intent->li_extent));

        /* Map the client intent onto a layout change opcode. */
        switch (intent->li_opc) {
        case LAYOUT_INTENT_TRUNC:
        case LAYOUT_INTENT_WRITE:
                layout.mlc_opc = MD_LAYOUT_WRITE;
                layout.mlc_intent = intent;
                break;
        case LAYOUT_INTENT_ACCESS:
                /* layout.mlc_opc stays MD_LAYOUT_NOP */
                break;
        case LAYOUT_INTENT_READ:
        case LAYOUT_INTENT_GLIMPSE:
        case LAYOUT_INTENT_RELEASE:
        case LAYOUT_INTENT_RESTORE:
                CERROR("%s: Unsupported layout intent opc %d\n",
                       mdt_obd_name(info->mti_mdt), intent->li_opc);
                RETURN(-ENOTSUPP);
        default:
                CERROR("%s: Unknown layout intent opc %d\n",
                       mdt_obd_name(info->mti_mdt), intent->li_opc);
                RETURN(-EINVAL);
        }

        /* From here on every exit must drop @obj through the out label. */
        obj = mdt_object_find(info->mti_env, info->mti_mdt, fid);
        if (IS_ERR(obj))
                RETURN(PTR_ERR(obj));

        if (mdt_object_exists(obj) && !mdt_object_remote(obj)) {
                /* if the layout is going to be changed don't use the current
                 * EA size but the maximum one. That buffer will be shrunk
                 * to the actual size in req_capsule_shrink() before reply.
                 */
                if (layout.mlc_opc == MD_LAYOUT_WRITE) {
                        layout_size = info->mti_mdt->mdt_max_mdsize;
                } else {
                        layout_size = mdt_attr_get_eabuf_size(info, obj);
                        if (layout_size < 0)
                                GOTO(out, rc = layout_size);

                        /* remember the largest EA size seen so far */
                        if (layout_size > info->mti_mdt->mdt_max_mdsize)
                                info->mti_mdt->mdt_max_mdsize = layout_size;
                }
                CDEBUG(D_INFO, "%s: layout_size %d\n",
                       mdt_obd_name(info->mti_mdt), layout_size);
        }

        /*
         * set reply buffer size, so that ldlm_handle_enqueue0()->
         * ldlm_lvbo_fill() will fill the reply buffer with lovea.
         */
        req_capsule_set_size(info->mti_pill, &RMF_DLM_LVB, RCL_SERVER,
                             layout_size);
        rc = req_capsule_server_pack(info->mti_pill);
        if (rc)
                GOTO(out, rc);

        ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
        if (!ldlm_rep)
                GOTO(out, rc = -EPROTO);

        mdt_set_disposition(info, ldlm_rep, DISP_IT_EXECD);

        /* take lock in ldlm_lock_enqueue() for LAYOUT_INTENT_ACCESS */
        if (layout.mlc_opc == MD_LAYOUT_NOP)
                GOTO(out, rc = 0);

        /* rc == 1 is handled below as "reply taken from an earlier run" */
        rc = mdt_check_resent(info, mdt_reconstruct_generic, lhc);
        if (rc < 0)
                GOTO(out, rc);
        if (rc == 1) {
                DEBUG_REQ(D_INODE, mdt_info_req(info), "resent opt.");
                rc = lustre_msg_get_status(mdt_info_req(info)->rq_repmsg);
                GOTO(out, rc);
        }

        buf->lb_buf = NULL;
        buf->lb_len = 0;
        if (unlikely(req_is_replay(mdt_info_req(info)))) {
                buf->lb_buf = req_capsule_client_get(info->mti_pill,
                                                     &RMF_EADATA);
                buf->lb_len = req_capsule_get_size(info->mti_pill,
                                                     &RMF_EADATA, RCL_CLIENT);
                /*
                 * If it's a replay of a layout write intent RPC, the client
                 * saved the extended lovea when it got the original reply.
                 */
                if (buf->lb_len > 0)
                        mdt_fix_lov_magic(info, buf->lb_buf);
        }

        /* Get lock from request for possible resent case. */
        mdt_intent_fixup_resent(info, *lockp, lhc, flags);
        (*lockp)->l_lvb_type = LVB_T_LAYOUT;

        /*
         * Instantiate some layout components, if @buf contains lovea, then it's
         * a replay of the layout intent write RPC.
         */
        rc = mdt_layout_change(info, obj, lhc, &layout);
        ldlm_rep->lock_policy_res2 = clear_serious(rc);

        /*
         * If a lock is held in @lhc, hand it to the client in place of
         * *@lockp; a replacement granted in LCK_EX mode is downgraded to
         * LCK_CR.
         */
        if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
                rc = mdt_intent_lock_replace(info, lockp, lhc, flags, rc);
                if (rc == ELDLM_LOCK_REPLACED &&
                    (*lockp)->l_granted_mode == LCK_EX)
                        ldlm_lock_mode_downgrade(*lockp, LCK_CR);
        }

        EXIT;
out:
        mdt_object_put(info->mti_env, obj);
        return rc;
}
4207
4208 static int mdt_intent_open(enum ldlm_intent_flags it_opc,
4209                            struct mdt_thread_info *info,
4210                            struct ldlm_lock **lockp,
4211                            __u64 flags)
4212 {
4213         struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_RMT];
4214         struct ldlm_reply      *rep = NULL;
4215         long                    opc;
4216         int                     rc;
4217
4218         static const struct req_format *intent_fmts[REINT_MAX] = {
4219                 [REINT_CREATE]  = &RQF_LDLM_INTENT_CREATE,
4220                 [REINT_OPEN]    = &RQF_LDLM_INTENT_OPEN
4221         };
4222
4223         ENTRY;
4224
4225         opc = mdt_reint_opcode(mdt_info_req(info), intent_fmts);
4226         if (opc < 0)
4227                 RETURN(opc);
4228
4229         /* Get lock from request for possible resent case. */
4230         mdt_intent_fixup_resent(info, *lockp, lhc, flags);
4231
4232         rc = mdt_reint_internal(info, lhc, opc);
4233
4234         /* Check whether the reply has been packed successfully. */
4235         if (mdt_info_req(info)->rq_repmsg != NULL)
4236                 rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
4237         if (rep == NULL) {
4238                 if (is_serious(rc))
4239                         RETURN(rc);
4240                 else
4241                         RETURN(err_serious(-EFAULT));
4242         }
4243
4244         /* MDC expects this in any case */
4245         if (rc != 0)
4246                 mdt_set_disposition(info, rep, DISP_LOOKUP_EXECD);
4247
4248         /* the open lock or the lock for cross-ref object should be
4249          * returned to the client */
4250         if (lustre_handle_is_used(&lhc->mlh_reg_lh) &&
4251             (rc == 0 || rc == -MDT_EREMOTE_OPEN)) {
4252                 rep->lock_policy_res2 = 0;
4253                 rc = mdt_intent_lock_replace(info, lockp, lhc, flags, rc);
4254                 RETURN(rc);
4255         }
4256
4257         rep->lock_policy_res2 = clear_serious(rc);
4258
4259         if (rep->lock_policy_res2 == -ENOENT &&
4260             mdt_get_disposition(rep, DISP_LOOKUP_NEG) &&
4261             !mdt_get_disposition(rep, DISP_OPEN_CREATE))
4262                 rep->lock_policy_res2 = 0;
4263
4264         lhc->mlh_reg_lh.cookie = 0ull;
4265         if (rc == -ENOTCONN || rc == -ENODEV ||
4266             rc == -EOVERFLOW) { /**< if VBR failure then return error */
4267                 /*
4268                  * If it is the disconnect error (ENODEV & ENOCONN), the error
4269                  * will be returned by rq_status, and client at ptlrpc layer
4270                  * will detect this, then disconnect, reconnect the import
4271                  * immediately, instead of impacting the following the rpc.
4272                  */
4273                 RETURN(rc);
4274         }
4275         /*
4276          * For other cases, the error will be returned by intent, and client
4277          * will retrieve the result from intent.
4278          */
4279         RETURN(ELDLM_LOCK_ABORTED);
4280 }
4281
/*
 * Dispatch an intent to its handler.
 *
 * Selects the request format, handler and handler flags from @it_opc,
 * extends the request capsule, unpacks the request / packs the reply and
 * runs the handler. Quota intents are passed straight to the quota master
 * and never reach the common tail of this function.
 *
 * \param[in] it_opc    intent opcode sent by the client
 * \param[in] info      thread info of the request
 * \param[in,out] lockp lock created for the enqueue; handlers may replace it
 * \param[in] flags     LDLM_FL_* flags of the enqueue
 *
 * \retval -EPROTO      unknown intent, or no object although one is required
 * \retval -EOPNOTSUPP  quota intent but no quota master device
 * \retval -EROFS       read-only export
 * \retval other        result of mdt_unpack_req_pack_rep() or of the handler
 */
static int mdt_intent_opc(enum ldlm_intent_flags it_opc,
                          struct mdt_thread_info *info,
                          struct ldlm_lock **lockp,
                          u64 flags /* LDLM_FL_* */)
{
        struct req_capsule *pill = info->mti_pill;
        struct ptlrpc_request *req = mdt_info_req(info);
        const struct req_format *it_format;
        int (*it_handler)(enum ldlm_intent_flags,
                          struct mdt_thread_info *,
                          struct ldlm_lock **,
                          u64);
        enum tgt_handler_flags it_handler_flags = 0;
        struct ldlm_reply *rep;
        bool check_mdt_object = false;
        int rc;
        ENTRY;

        switch (it_opc) {
        case IT_OPEN:
        case IT_OPEN|IT_CREAT:
                /*
                 * OCREAT is not a MUTABOR request since the file may
                 * already exist. We do the extra check of
                 * OBD_CONNECT_RDONLY in mdt_reint_open() when we
                 * really need to create the object.
                 */
                it_format = &RQF_LDLM_INTENT;
                it_handler = &mdt_intent_open;
                break;
        case IT_GETATTR:
                check_mdt_object = true;
                /* fallthrough */
        case IT_LOOKUP:
                it_format = &RQF_LDLM_INTENT_GETATTR;
                it_handler = &mdt_intent_getattr;
                it_handler_flags = HABEO_REFERO;
                break;
        case IT_GETXATTR:
                check_mdt_object = true;
                it_format = &RQF_LDLM_INTENT_GETXATTR;
                it_handler = &mdt_intent_getxattr;
                it_handler_flags = HABEO_CORPUS;
                break;
        case IT_LAYOUT:
                it_format = &RQF_LDLM_INTENT_LAYOUT;
                it_handler = &mdt_intent_layout;
                break;
        case IT_GLIMPSE:
                it_format = &RQF_LDLM_INTENT;
                it_handler = &mdt_intent_glimpse;
                break;
        case IT_BRW:
                it_format = &RQF_LDLM_INTENT;
                it_handler = &mdt_intent_brw;
                break;
        case IT_QUOTA_DQACQ:
        case IT_QUOTA_CONN: {
                struct lu_device *qmt = info->mti_mdt->mdt_qmt_dev;

                if (qmt == NULL)
                        RETURN(-EOPNOTSUPP);

                if (mdt_rdonly(req->rq_export))
                        RETURN(-EROFS);

                (*lockp)->l_lvb_type = LVB_T_LQUOTA;
                /* pass the request to quota master */
                rc = qmt_hdls.qmth_intent_policy(info->mti_env, qmt,
                                                 mdt_info_req(info), lockp,
                                                 flags);
                RETURN(rc);
        }
        default:
                CERROR("%s: unknown intent code %#x\n",
                       mdt_obd_name(info->mti_mdt), it_opc);
                RETURN(-EPROTO);
        }

        req_capsule_extend(pill, it_format);

        rc = mdt_unpack_req_pack_rep(info, it_handler_flags);
        if (rc < 0)
                RETURN(rc);

        /* IT_GETATTR and IT_GETXATTR cannot be served without an object. */
        if (unlikely(info->mti_object == NULL && check_mdt_object))
                RETURN(-EPROTO);

        /* NOTE(review): no case above sets MUTABOR in it_handler_flags. */
        if (it_handler_flags & MUTABOR && mdt_rdonly(req->rq_export))
                RETURN(-EROFS);

        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_INTENT_DELAY, 10);

        /* execute policy */
        rc = (*it_handler)(it_opc, info, lockp, flags);

        /* Check whether the reply has been packed successfully. */
        if (req->rq_repmsg != NULL) {
                rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
                /* convert the intent status through ptlrpc_status_hton() */
                rep->lock_policy_res2 =
                        ptlrpc_status_hton(rep->lock_policy_res2);
        }

        RETURN(rc);
}
4386
4387 static void mdt_ptlrpc_stats_update(struct ptlrpc_request *req,
4388                                     enum ldlm_intent_flags it_opc)
4389 {
4390         struct lprocfs_stats *srv_stats = ptlrpc_req2svc(req)->srv_stats;
4391
4392         /* update stats when IT code is known */
4393         if (srv_stats != NULL)
4394                 lprocfs_counter_incr(srv_stats,
4395                                 PTLRPC_LAST_CNTR + (it_opc == IT_GLIMPSE ?
4396                                 LDLM_GLIMPSE_ENQUEUE : LDLM_IBITS_ENQUEUE));
4397 }
4398
4399 static int mdt_intent_policy(const struct lu_env *env,
4400                              struct ldlm_namespace *ns,
4401                              struct ldlm_lock **lockp,
4402                              void *req_cookie,
4403                              enum ldlm_mode mode,
4404                              __u64 flags, void *data)
4405 {
4406         struct tgt_session_info *tsi;
4407         struct mdt_thread_info  *info;
4408         struct ptlrpc_request   *req  =  req_cookie;
4409         struct ldlm_intent      *it;
4410         struct req_capsule      *pill;
4411         const struct ldlm_lock_desc *ldesc;
4412         int rc;
4413
4414         ENTRY;
4415
4416         LASSERT(req != NULL);
4417
4418         tsi = tgt_ses_info(env);
4419
4420         info = tsi2mdt_info(tsi);
4421         LASSERT(info != NULL);
4422         pill = info->mti_pill;
4423         LASSERT(pill->rc_req == req);
4424         ldesc = &info->mti_dlm_req->lock_desc;
4425
4426         if (req->rq_reqmsg->lm_bufcount > DLM_INTENT_IT_OFF) {
4427                 req_capsule_extend(pill, &RQF_LDLM_INTENT_BASIC);
4428                 it = req_capsule_client_get(pill, &RMF_LDLM_INTENT);
4429                 if (it != NULL) {
4430                         mdt_ptlrpc_stats_update(req, it->opc);
4431                         rc = mdt_intent_opc(it->opc, info, lockp, flags);
4432                         if (rc == 0)
4433                                 rc = ELDLM_OK;
4434
4435                         /* Lock without inodebits makes no sense and will oops
4436                          * later in ldlm. Let's check it now to see if we have
4437                          * ibits corrupted somewhere in mdt_intent_opc().
4438                          * The case for client miss to set ibits has been
4439                          * processed by others. */
4440                         LASSERT(ergo(ldesc->l_resource.lr_type == LDLM_IBITS,
4441                                 ldesc->l_policy_data.l_inodebits.bits != 0));
4442                 } else {
4443                         rc = err_serious(-EFAULT);
4444                 }
4445         } else {
4446                 /* No intent was provided */
4447                 req_capsule_set_size(pill, &RMF_DLM_LVB, RCL_SERVER, 0);
4448                 rc = req_capsule_server_pack(pill);
4449                 if (rc)
4450                         rc = err_serious(rc);
4451         }
4452         mdt_thread_info_fini(info);
4453         RETURN(rc);
4454 }
4455
4456 static void mdt_deregister_seq_exp(struct mdt_device *mdt)
4457 {
4458         struct seq_server_site  *ss = mdt_seq_site(mdt);
4459
4460         if (ss->ss_node_id == 0)
4461                 return;
4462
4463         if (ss->ss_client_seq != NULL) {
4464                 lustre_deregister_lwp_item(&ss->ss_client_seq->lcs_exp);
4465                 ss->ss_client_seq->lcs_exp = NULL;
4466         }
4467
4468         if (ss->ss_server_fld != NULL) {
4469                 lustre_deregister_lwp_item(&ss->ss_server_fld->lsf_control_exp);
4470                 ss->ss_server_fld->lsf_control_exp = NULL;
4471         }
4472 }
4473
4474 static void mdt_seq_fini_cli(struct mdt_device *mdt)
4475 {
4476         struct seq_server_site *ss = mdt_seq_site(mdt);
4477
4478         if (ss == NULL)
4479                 return;
4480
4481         if (ss->ss_server_seq != NULL)
4482                 seq_server_set_cli(NULL, ss->ss_server_seq, NULL);
4483 }
4484
/*
 * Tear down the sequence machinery of @mdt: detach the sequence client,
 * deregister the LWP exports and finalize the sequence site.
 *
 * \retval result of seq_site_fini()
 */
static int mdt_seq_fini(const struct lu_env *env, struct mdt_device *mdt)
{
        int rc;

        mdt_seq_fini_cli(mdt);
        mdt_deregister_seq_exp(mdt);

        rc = seq_site_fini(env, mdt_seq_site(mdt));

        return rc;
}
4492
4493 /**
4494  * It will retrieve its FLDB entries from MDT0, and it only happens
4495  * when upgrading existent FS to 2.6 or when local FLDB is corrupted,
4496  * and it needs to refresh FLDB from the MDT0.
4497  **/
4498 static int mdt_register_lwp_callback(void *data)
4499 {
4500         struct lu_env           env;
4501         struct mdt_device       *mdt = data;
4502         struct lu_server_fld    *fld = mdt_seq_site(mdt)->ss_server_fld;
4503         int                     rc;
4504         ENTRY;
4505
4506         LASSERT(mdt_seq_site(mdt)->ss_node_id != 0);
4507
4508         rc = lu_env_init(&env, LCT_MD_THREAD);
4509         if (rc < 0) {
4510                 CERROR("%s: cannot init env: rc = %d\n", mdt_obd_name(mdt), rc);
4511                 RETURN(rc);
4512         }
4513
4514         /* Allocate new sequence now to avoid creating local transaction
4515          * in the normal transaction process */
4516         rc = seq_server_check_and_alloc_super(&env,
4517                                               mdt_seq_site(mdt)->ss_server_seq);
4518         if (rc < 0)
4519                 GOTO(out, rc);
4520
4521         if (fld->lsf_new) {
4522                 rc = fld_update_from_controller(&env, fld);
4523                 if (rc != 0) {
4524                         CERROR("%s: cannot update controller: rc = %d\n",
4525                                mdt_obd_name(mdt), rc);
4526                         GOTO(out, rc);
4527                 }
4528         }
4529 out:
4530         lu_env_fini(&env);
4531         RETURN(rc);
4532 }
4533
4534 static int mdt_register_seq_exp(struct mdt_device *mdt)
4535 {
4536         struct seq_server_site  *ss = mdt_seq_site(mdt);
4537         char                    *lwp_name = NULL;
4538         int                     rc;
4539
4540         if (ss->ss_node_id == 0)
4541                 return 0;
4542
4543         OBD_ALLOC(lwp_name, MAX_OBD_NAME);
4544         if (lwp_name == NULL)
4545                 GOTO(out_free, rc = -ENOMEM);
4546
4547         rc = tgt_name2lwp_name(mdt_obd_name(mdt), lwp_name, MAX_OBD_NAME, 0);
4548         if (rc != 0)
4549                 GOTO(out_free, rc);
4550
4551         rc = lustre_register_lwp_item(lwp_name, &ss->ss_client_seq->lcs_exp,
4552                                       NULL, NULL);
4553         if (rc != 0)
4554                 GOTO(out_free, rc);
4555
4556         rc = lustre_register_lwp_item(lwp_name,
4557                                       &ss->ss_server_fld->lsf_control_exp,
4558                                       mdt_register_lwp_callback, mdt);
4559         if (rc != 0) {
4560                 lustre_deregister_lwp_item(&ss->ss_client_seq->lcs_exp);
4561                 ss->ss_client_seq->lcs_exp = NULL;
4562                 GOTO(out_free, rc);
4563         }
4564 out_free:
4565         if (lwp_name != NULL)
4566                 OBD_FREE(lwp_name, MAX_OBD_NAME);
4567
4568         return rc;
4569 }
4570
4571 /*
4572  * Init client sequence manager which is used by local MDS to talk to sequence
4573  * controller on remote node.
4574  */
4575 static int mdt_seq_init_cli(const struct lu_env *env, struct mdt_device *mdt)
4576 {
4577         struct seq_server_site  *ss = mdt_seq_site(mdt);
4578         int                     rc;
4579         char                    *prefix;
4580         ENTRY;
4581
4582         /* check if this is adding the first MDC and controller is not yet
4583          * initialized. */
4584         OBD_ALLOC_PTR(ss->ss_client_seq);
4585         if (ss->ss_client_seq == NULL)
4586                 RETURN(-ENOMEM);
4587
4588         OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
4589         if (prefix == NULL) {
4590                 OBD_FREE_PTR(ss->ss_client_seq);
4591                 ss->ss_client_seq = NULL;
4592                 RETURN(-ENOMEM);
4593         }
4594
4595         /* Note: seq_client_fini will be called in seq_site_fini */
4596         snprintf(prefix, MAX_OBD_NAME + 5, "ctl-%s", mdt_obd_name(mdt));
4597         rc = seq_client_init(ss->ss_client_seq, NULL, LUSTRE_SEQ_METADATA,
4598                              prefix, ss->ss_node_id == 0 ?  ss->ss_control_seq :
4599                                                             NULL);
4600         OBD_FREE(prefix, MAX_OBD_NAME + 5);
4601         if (rc != 0) {
4602                 OBD_FREE_PTR(ss->ss_client_seq);
4603                 ss->ss_client_seq = NULL;
4604                 RETURN(rc);
4605         }
4606
4607         rc = seq_server_set_cli(env, ss->ss_server_seq, ss->ss_client_seq);
4608
4609         RETURN(rc);
4610 }
4611
4612 static int mdt_seq_init(const struct lu_env *env, struct mdt_device *mdt)
4613 {
4614         struct seq_server_site  *ss;
4615         int                     rc;
4616         ENTRY;
4617
4618         ss = mdt_seq_site(mdt);
4619         /* init sequence controller server(MDT0) */
4620         if (ss->ss_node_id == 0) {
4621                 OBD_ALLOC_PTR(ss->ss_control_seq);
4622                 if (ss->ss_control_seq == NULL)
4623                         RETURN(-ENOMEM);
4624
4625                 rc = seq_server_init(env, ss->ss_control_seq, mdt->mdt_bottom,
4626                                      mdt_obd_name(mdt), LUSTRE_SEQ_CONTROLLER,
4627                                      ss);
4628                 if (rc)
4629                         GOTO(out_seq_fini, rc);
4630         }
4631
4632         /* Init normal sequence server */
4633         OBD_ALLOC_PTR(ss->ss_server_seq);
4634         if (ss->ss_server_seq == NULL)
4635                 GOTO(out_seq_fini, rc = -ENOMEM);
4636
4637         rc = seq_server_init(env, ss->ss_server_seq, mdt->mdt_bottom,
4638                              mdt_obd_name(mdt), LUSTRE_SEQ_SERVER, ss);
4639         if (rc)
4640                 GOTO(out_seq_fini, rc);
4641
4642         /* init seq client for seq server to talk to seq controller(MDT0) */
4643         rc = mdt_seq_init_cli(env, mdt);
4644         if (rc != 0)
4645                 GOTO(out_seq_fini, rc);
4646
4647         if (ss->ss_node_id != 0)
4648                 /* register controller export through lwp */
4649                 rc = mdt_register_seq_exp(mdt);
4650
4651         EXIT;
4652 out_seq_fini:
4653         if (rc)
4654                 mdt_seq_fini(env, mdt);
4655
4656         return rc;
4657 }
4658
4659 /*
4660  * FLD wrappers
4661  */
4662 static int mdt_fld_fini(const struct lu_env *env,
4663                         struct mdt_device *m)
4664 {
4665         struct seq_server_site *ss = mdt_seq_site(m);
4666         ENTRY;
4667
4668         if (ss && ss->ss_server_fld) {
4669                 fld_server_fini(env, ss->ss_server_fld);
4670                 OBD_FREE_PTR(ss->ss_server_fld);
4671                 ss->ss_server_fld = NULL;
4672         }
4673
4674         RETURN(0);
4675 }
4676
4677 static int mdt_fld_init(const struct lu_env *env,
4678                         const char *uuid,
4679                         struct mdt_device *m)
4680 {
4681         struct seq_server_site *ss;
4682         int rc;
4683         ENTRY;
4684
4685         ss = mdt_seq_site(m);
4686
4687         OBD_ALLOC_PTR(ss->ss_server_fld);
4688         if (ss->ss_server_fld == NULL)
4689                 RETURN(rc = -ENOMEM);
4690
4691         rc = fld_server_init(env, ss->ss_server_fld, m->mdt_bottom, uuid,
4692                              LU_SEQ_RANGE_MDT);
4693         if (rc) {
4694                 OBD_FREE_PTR(ss->ss_server_fld);
4695                 ss->ss_server_fld = NULL;
4696                 RETURN(rc);
4697         }
4698
4699         RETURN(0);
4700 }
4701
4702 static void mdt_stack_pre_fini(const struct lu_env *env,
4703                            struct mdt_device *m, struct lu_device *top)
4704 {
4705         struct lustre_cfg_bufs  *bufs;
4706         struct lustre_cfg       *lcfg;
4707         struct mdt_thread_info  *info;
4708         ENTRY;
4709
4710         LASSERT(top);
4711
4712         info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
4713         LASSERT(info != NULL);
4714
4715         bufs = &info->mti_u.bufs;
4716
4717         LASSERT(m->mdt_child_exp);
4718         LASSERT(m->mdt_child_exp->exp_obd);
4719
4720         /* process cleanup, pass mdt obd name to get obd umount flags */
4721         /* XXX: this is needed because all layers are referenced by
4722          * objects (some of them are pinned by osd, for example *
4723          * the proper solution should be a model where object used
4724          * by osd only doesn't have mdt/mdd slices -bzzz */
4725         lustre_cfg_bufs_reset(bufs, mdt_obd_name(m));
4726         lustre_cfg_bufs_set_string(bufs, 1, NULL);
4727         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
4728         if (!lcfg)
4729                 RETURN_EXIT;
4730         lustre_cfg_init(lcfg, LCFG_PRE_CLEANUP, bufs);
4731
4732         top->ld_ops->ldo_process_config(env, top, lcfg);
4733         OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
4734         EXIT;
4735 }
4736
4737 static void mdt_stack_fini(const struct lu_env *env,
4738                            struct mdt_device *m, struct lu_device *top)
4739 {
4740         struct obd_device       *obd = mdt2obd_dev(m);
4741         struct lustre_cfg_bufs  *bufs;
4742         struct lustre_cfg       *lcfg;
4743         struct mdt_thread_info  *info;
4744         char                     flags[3] = "";
4745         ENTRY;
4746
4747         info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
4748         LASSERT(info != NULL);
4749
4750         lu_dev_del_linkage(top->ld_site, top);
4751
4752         lu_site_purge(env, top->ld_site, -1);
4753
4754         bufs = &info->mti_u.bufs;
4755         /* process cleanup, pass mdt obd name to get obd umount flags */
4756         /* another purpose is to let all layers to release their objects */
4757         lustre_cfg_bufs_reset(bufs, mdt_obd_name(m));
4758         if (obd->obd_force)
4759                 strcat(flags, "F");
4760         if (obd->obd_fail)
4761                 strcat(flags, "A");
4762         lustre_cfg_bufs_set_string(bufs, 1, flags);
4763         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
4764         if (!lcfg)
4765                 RETURN_EXIT;
4766         lustre_cfg_init(lcfg, LCFG_CLEANUP, bufs);
4767
4768         LASSERT(top);
4769         top->ld_ops->ldo_process_config(env, top, lcfg);
4770         OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
4771
4772         lu_site_purge(env, top->ld_site, -1);
4773
4774         m->mdt_child = NULL;
4775         m->mdt_bottom = NULL;
4776
4777         obd_disconnect(m->mdt_child_exp);
4778         m->mdt_child_exp = NULL;
4779
4780         obd_disconnect(m->mdt_bottom_exp);
4781         m->mdt_child_exp = NULL;
4782 }
4783
4784 static int mdt_connect_to_next(const struct lu_env *env, struct mdt_device *m,
4785                                const char *next, struct obd_export **exp)
4786 {
4787         struct obd_connect_data *data = NULL;
4788         struct obd_device       *obd;
4789         int                      rc;
4790         ENTRY;
4791
4792         OBD_ALLOC_PTR(data);
4793         if (data == NULL)
4794                 GOTO(out, rc = -ENOMEM);
4795
4796         obd = class_name2obd(next);
4797         if (obd == NULL) {
4798                 CERROR("%s: can't locate next device: %s\n",
4799                        mdt_obd_name(m), next);
4800                 GOTO(out, rc = -ENOTCONN);
4801         }
4802
4803         data->ocd_connect_flags = OBD_CONNECT_VERSION;
4804         data->ocd_version = LUSTRE_VERSION_CODE;
4805
4806         rc = obd_connect(NULL, exp, obd, &obd->obd_uuid, data, NULL);
4807         if (rc) {
4808                 CERROR("%s: cannot connect to next dev %s (%d)\n",
4809                        mdt_obd_name(m), next, rc);
4810                 GOTO(out, rc);
4811         }
4812
4813 out:
4814         if (data)
4815                 OBD_FREE_PTR(data);
4816         RETURN(rc);
4817 }
4818
4819 static int mdt_stack_init(const struct lu_env *env, struct mdt_device *mdt,
4820                           struct lustre_cfg *cfg)
4821 {
4822         char                   *dev = lustre_cfg_string(cfg, 0);
4823         int                     rc, name_size, uuid_size;
4824         char                   *name, *uuid, *p;
4825         struct lustre_cfg_bufs *bufs;
4826         struct lustre_cfg      *lcfg;
4827         struct obd_device      *obd;
4828         struct lustre_profile  *lprof;
4829         struct lu_site         *site;
4830         ENTRY;
4831
4832         /* in 1.8 we had the only device in the stack - MDS.
4833          * 2.0 introduces MDT, MDD, OSD; MDT starts others internally.
4834          * in 2.3 OSD is instantiated by obd_mount.c, so we need
4835          * to generate names and setup MDT, MDD. MDT will be using
4836          * generated name to connect to MDD. for MDD the next device
4837          * will be LOD with name taken from so called "profile" which
4838          * is generated by mount_option line
4839          *
4840          * 1.8 MGS generates config. commands like this:
4841          *   #06 (104)mount_option 0:  1:lustre-MDT0000  2:lustre-mdtlov
4842          *   #08 (120)setup   0:lustre-MDT0000  1:dev 2:type 3:lustre-MDT0000
4843          * 2.0 MGS generates config. commands like this:
4844          *   #07 (112)mount_option 0:  1:lustre-MDT0000  2:lustre-MDT0000-mdtlov
4845          *   #08 (160)setup   0:lustre-MDT0000  1:lustre-MDT0000_UUID  2:0
4846          *                    3:lustre-MDT0000-mdtlov  4:f
4847          *
4848          * we generate MDD name from MDT one, just replacing T with D
4849          *
4850          * after all the preparations, the logical equivalent will be
4851          *   #01 (160)setup   0:lustre-MDD0000  1:lustre-MDD0000_UUID  2:0
4852          *                    3:lustre-MDT0000-mdtlov  4:f
4853          *   #02 (160)setup   0:lustre-MDT0000  1:lustre-MDT0000_UUID  2:0
4854          *                    3:lustre-MDD0000  4:f
4855          *
4856          *  notice we build the stack from down to top: MDD first, then MDT */
4857
4858         name_size = MAX_OBD_NAME;
4859         uuid_size = MAX_OBD_NAME;
4860
4861         OBD_ALLOC(name, name_size);
4862         OBD_ALLOC(uuid, uuid_size);
4863         if (name == NULL || uuid == NULL)
4864                 GOTO(cleanup_mem, rc = -ENOMEM);
4865
4866         OBD_ALLOC_PTR(bufs);
4867         if (!bufs)
4868                 GOTO(cleanup_mem, rc = -ENOMEM);
4869
4870         strcpy(name, dev);
4871         p = strstr(name, "-MDT");
4872         if (p == NULL)
4873                 GOTO(free_bufs, rc = -ENOMEM);
4874         p[3] = 'D';
4875
4876         snprintf(uuid, MAX_OBD_NAME, "%s_UUID", name);
4877
4878         lprof = class_get_profile(lustre_cfg_string(cfg, 0));
4879         if (lprof == NULL || lprof->lp_dt == NULL) {
4880                 CERROR("can't find the profile: %s\n",
4881                        lustre_cfg_string(cfg, 0));
4882                 GOTO(free_bufs, rc = -EINVAL);
4883         }
4884
4885         lustre_cfg_bufs_reset(bufs, name);
4886         lustre_cfg_bufs_set_string(bufs, 1, LUSTRE_MDD_NAME);
4887         lustre_cfg_bufs_set_string(bufs, 2, uuid);
4888         lustre_cfg_bufs_set_string(bufs, 3, lprof->lp_dt);
4889
4890         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
4891         if (!lcfg)
4892                 GOTO(put_profile, rc = -ENOMEM);
4893         lustre_cfg_init(lcfg, LCFG_ATTACH, bufs);
4894
4895         rc = class_attach(lcfg);
4896         if (rc)
4897                 GOTO(lcfg_cleanup, rc);
4898
4899         obd = class_name2obd(name);
4900         if (!obd) {
4901                 CERROR("Can not find obd %s (%s in config)\n",
4902                        MDD_OBD_NAME, lustre_cfg_string(cfg, 0));
4903                 GOTO(lcfg_cleanup, rc = -EINVAL);
4904         }
4905
4906         OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
4907
4908         lustre_cfg_bufs_reset(bufs, name);
4909         lustre_cfg_bufs_set_string(bufs, 1, uuid);
4910         lustre_cfg_bufs_set_string(bufs, 2, dev);
4911         lustre_cfg_bufs_set_string(bufs, 3, lprof->lp_dt);
4912
4913         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
4914         if (!lcfg)
4915                 GOTO(class_detach, rc = -ENOMEM);
4916         lustre_cfg_init(lcfg, LCFG_SETUP, bufs);
4917
4918         rc = class_setup(obd, lcfg);
4919         if (rc)
4920                 GOTO(class_detach, rc);
4921
4922         /* connect to MDD we just setup */
4923         rc = mdt_connect_to_next(env, mdt, name, &mdt->mdt_child_exp);
4924         if (rc)
4925                 GOTO(class_detach, rc);
4926
4927         site = mdt->mdt_child_exp->exp_obd->obd_lu_dev->ld_site;
4928         LASSERT(site);
4929         LASSERT(mdt_lu_site(mdt) == NULL);
4930         mdt->mdt_lu_dev.ld_site = site;
4931         site->ls_top_dev = &mdt->mdt_lu_dev;
4932         mdt->mdt_child = lu2md_dev(mdt->mdt_child_exp->exp_obd->obd_lu_dev);
4933
4934         /* now connect to bottom OSD */
4935         snprintf(name, MAX_OBD_NAME, "%s-osd", dev);
4936         rc = mdt_connect_to_next(env, mdt, name, &mdt->mdt_bottom_exp);
4937         if (rc)
4938                 GOTO(class_detach, rc);
4939         mdt->mdt_bottom =
4940                 lu2dt_dev(mdt->mdt_bottom_exp->exp_obd->obd_lu_dev);
4941
4942         rc = lu_env_refill((struct lu_env *)env);
4943         if (rc != 0)
4944                 CERROR("Failure to refill session: '%d'\n", rc);
4945
4946         lu_dev_add_linkage(site, &mdt->mdt_lu_dev);
4947
4948         EXIT;
4949 class_detach:
4950         if (rc)
4951                 class_detach(obd, lcfg);
4952 lcfg_cleanup:
4953         OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
4954 put_profile:
4955         class_put_profile(lprof);
4956 free_bufs:
4957         OBD_FREE_PTR(bufs);
4958 cleanup_mem:
4959         if (name)
4960                 OBD_FREE(name, name_size);
4961         if (uuid)
4962                 OBD_FREE(uuid, uuid_size);
4963         RETURN(rc);
4964 }
4965
4966 /* setup quota master target on MDT0 */
4967 static int mdt_quota_init(const struct lu_env *env, struct mdt_device *mdt,
4968                           struct lustre_cfg *cfg)
4969 {
4970         struct obd_device       *obd;
4971         char                    *dev = lustre_cfg_string(cfg, 0);
4972         char                    *qmtname, *uuid, *p;
4973         struct lustre_cfg_bufs  *bufs;
4974         struct lustre_cfg       *lcfg;
4975         struct lustre_profile   *lprof;
4976         struct obd_connect_data *data;
4977         int                      rc;
4978         ENTRY;
4979
4980         LASSERT(mdt->mdt_qmt_exp == NULL);
4981         LASSERT(mdt->mdt_qmt_dev == NULL);
4982
4983         /* quota master is on MDT0 only for now */
4984         if (mdt->mdt_seq_site.ss_node_id != 0)
4985                 RETURN(0);
4986
4987         /* MGS generates config commands which look as follows:
4988          *   #01 (160)setup   0:lustre-MDT0000  1:lustre-MDT0000_UUID  2:0
4989          *                    3:lustre-MDT0000-mdtlov  4:f
4990          *
4991          * We generate the QMT name from the MDT one, just replacing MD with QM
4992          * after all the preparations, the logical equivalent will be:
4993          *   #01 (160)setup   0:lustre-QMT0000  1:lustre-QMT0000_UUID  2:0
4994          *                    3:lustre-MDT0000-osd  4:f */
4995         OBD_ALLOC(qmtname, MAX_OBD_NAME);
4996         OBD_ALLOC(uuid, UUID_MAX);
4997         OBD_ALLOC_PTR(bufs);
4998         OBD_ALLOC_PTR(data);
4999         if (qmtname == NULL || uuid == NULL || bufs == NULL || data == NULL)
5000                 GOTO(cleanup_mem, rc = -ENOMEM);
5001
5002         strcpy(qmtname, dev);
5003         p = strstr(qmtname, "-MDT");
5004         if (p == NULL)
5005                 GOTO(cleanup_mem, rc = -ENOMEM);
5006         /* replace MD with QM */
5007         p[1] = 'Q';
5008         p[2] = 'M';
5009
5010         snprintf(uuid, UUID_MAX, "%s_UUID", qmtname);
5011
5012         lprof = class_get_profile(lustre_cfg_string(cfg, 0));
5013         if (lprof == NULL || lprof->lp_dt == NULL) {
5014                 CERROR("can't find profile for %s\n",
5015                        lustre_cfg_string(cfg, 0));
5016                 GOTO(cleanup_mem, rc = -EINVAL);
5017         }
5018
5019         lustre_cfg_bufs_reset(bufs, qmtname);
5020         lustre_cfg_bufs_set_string(bufs, 1, LUSTRE_QMT_NAME);
5021         lustre_cfg_bufs_set_string(bufs, 2, uuid);
5022         lustre_cfg_bufs_set_string(bufs, 3, lprof->lp_dt);
5023
5024         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
5025         if (!lcfg)
5026                 GOTO(put_profile, rc = -ENOMEM);
5027         lustre_cfg_init(lcfg, LCFG_ATTACH, bufs);
5028
5029         rc = class_attach(lcfg);
5030         if (rc)
5031                 GOTO(lcfg_cleanup, rc);
5032
5033         obd = class_name2obd(qmtname);
5034         if (!obd) {
5035                 CERROR("Can not find obd %s (%s in config)\n", qmtname,
5036                        lustre_cfg_string(cfg, 0));
5037                 GOTO(lcfg_cleanup, rc = -EINVAL);
5038         }
5039
5040         OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
5041
5042         lustre_cfg_bufs_reset(bufs, qmtname);
5043         lustre_cfg_bufs_set_string(bufs, 1, uuid);
5044         lustre_cfg_bufs_set_string(bufs, 2, dev);
5045
5046         /* for quota, the next device should be the OSD device */
5047         lustre_cfg_bufs_set_string(bufs, 3,
5048                                    mdt->mdt_bottom->dd_lu_dev.ld_obd->obd_name);
5049
5050         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen));
5051         if (!lcfg)
5052                 GOTO(class_detach, rc = -ENOMEM);
5053         lustre_cfg_init(lcfg, LCFG_SETUP, bufs);
5054
5055         rc = class_setup(obd, lcfg);
5056         if (rc)
5057                 GOTO(class_detach, rc);
5058
5059         mdt->mdt_qmt_dev = obd->obd_lu_dev;
5060
5061         /* configure local quota objects */
5062         rc = mdt->mdt_qmt_dev->ld_ops->ldo_prepare(env,
5063                                                    &mdt->mdt_lu_dev,
5064                                                    mdt->mdt_qmt_dev);
5065         if (rc)
5066                 GOTO(class_cleanup, rc);
5067
5068         /* connect to quota master target */
5069         data->ocd_connect_flags = OBD_CONNECT_VERSION;
5070         data->ocd_version = LUSTRE_VERSION_CODE;
5071         rc = obd_connect(NULL, &mdt->mdt_qmt_exp, obd, &obd->obd_uuid,
5072                          data, NULL);
5073         if (rc) {
5074                 CERROR("cannot connect to quota master device %s (%d)\n",
5075                        qmtname, rc);
5076                 GOTO(class_cleanup, rc);
5077         }
5078
5079         EXIT;
5080 class_cleanup:
5081         if (rc) {
5082                 class_manual_cleanup(obd);
5083                 mdt->mdt_qmt_dev = NULL;
5084         }
5085 class_detach:
5086         if (rc)
5087                 class_detach(obd, lcfg);
5088 lcfg_cleanup:
5089         OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
5090 put_profile:
5091         class_put_profile(lprof);
5092 cleanup_mem:
5093         if (bufs)
5094                 OBD_FREE_PTR(bufs);
5095         if (qmtname)
5096                 OBD_FREE(qmtname, MAX_OBD_NAME);
5097         if (uuid)
5098                 OBD_FREE(uuid, UUID_MAX);
5099         if (data)
5100                 OBD_FREE_PTR(data);
5101         return rc;
5102 }
5103
5104 /* Shutdown quota master target associated with mdt */
5105 static void mdt_quota_fini(const struct lu_env *env, struct mdt_device *mdt)
5106 {
5107         ENTRY;
5108
5109         if (mdt->mdt_qmt_exp == NULL)
5110                 RETURN_EXIT;
5111         LASSERT(mdt->mdt_qmt_dev != NULL);
5112
5113         /* the qmt automatically shuts down when the mdt disconnects */
5114         obd_disconnect(mdt->mdt_qmt_exp);
5115         mdt->mdt_qmt_exp = NULL;
5116         mdt->mdt_qmt_dev = NULL;
5117         EXIT;
5118 }
5119
5120 /* mdt_getxattr() is used from mdt_intent_getxattr(), use this wrapper
5121  * for now. This will be removed along with converting rest of MDT code
5122  * to use tgt_session_info */
5123 static int mdt_tgt_getxattr(struct tgt_session_info *tsi)
5124 {
5125         struct mdt_thread_info  *info = tsi2mdt_info(tsi);
5126         int                      rc;
5127
5128         if (unlikely(info->mti_object == NULL))
5129                 return -EPROTO;
5130
5131         rc = mdt_getxattr(info);
5132
5133         mdt_thread_info_fini(info);
5134         return rc;
5135 }
5136
/*
 * Aliases mapping the READ/WRITE spellings onto the existing BRW fail-check
 * and OST opcode constants.
 * NOTE(review): presumably needed by the name pasting done in the
 * TGT_OST_HDL* macros used for mdt_io_ops below -- confirm.
 */
#define OBD_FAIL_OST_READ_NET	OBD_FAIL_OST_BRW_NET
#define OBD_FAIL_OST_WRITE_NET	OBD_FAIL_OST_BRW_NET
#define OST_BRW_READ		OST_READ
#define OST_BRW_WRITE		OST_WRITE
5141
5142 static struct tgt_handler mdt_tgt_handlers[] = {
5143 TGT_RPC_HANDLER(MDS_FIRST_OPC,
5144                 0,                      MDS_CONNECT,    mdt_tgt_connect,
5145                 &RQF_CONNECT, LUSTRE_OBD_VERSION),
5146 TGT_RPC_HANDLER(MDS_FIRST_OPC,
5147                 0,                      MDS_DISCONNECT, tgt_disconnect,
5148                 &RQF_MDS_DISCONNECT, LUSTRE_OBD_VERSION),
5149 TGT_RPC_HANDLER(MDS_FIRST_OPC,
5150                 HABEO_REFERO,           MDS_SET_INFO,   mdt_set_info,
5151                 &RQF_OBD_SET_INFO, LUSTRE_MDS_VERSION),
5152 TGT_MDT_HDL(0,                          MDS_GET_INFO,   mdt_get_info),
5153 TGT_MDT_HDL(0           | HABEO_REFERO, MDS_GET_ROOT,   mdt_get_root),
5154 TGT_MDT_HDL(HABEO_CORPUS,               MDS_GETATTR,    mdt_getattr),
5155 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_GETATTR_NAME,
5156                                                         mdt_getattr_name),
5157 TGT_MDT_HDL(HABEO_CORPUS,               MDS_GETXATTR,   mdt_tgt_getxattr),
5158 TGT_MDT_HDL(0           | HABEO_REFERO, MDS_STATFS,     mdt_statfs),
5159 TGT_MDT_HDL(0           | MUTABOR,      MDS_REINT,      mdt_reint),
5160 TGT_MDT_HDL(HABEO_CORPUS,               MDS_CLOSE,      mdt_close),
5161 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_READPAGE,   mdt_readpage),
5162 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_SYNC,       mdt_sync),
5163 TGT_MDT_HDL(0,                          MDS_QUOTACTL,   mdt_quotactl),
5164 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO | MUTABOR, MDS_HSM_PROGRESS,
5165                                                         mdt_hsm_progress),
5166 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO | MUTABOR, MDS_HSM_CT_REGISTER,
5167                                                         mdt_hsm_ct_register),
5168 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO | MUTABOR, MDS_HSM_CT_UNREGISTER,
5169                                                         mdt_hsm_ct_unregister),
5170 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_HSM_STATE_GET,
5171                                                         mdt_hsm_state_get),
5172 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO | MUTABOR, MDS_HSM_STATE_SET,
5173                                                         mdt_hsm_state_set),
5174 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_HSM_ACTION, mdt_hsm_action),
5175 TGT_MDT_HDL(HABEO_CORPUS| HABEO_REFERO, MDS_HSM_REQUEST,
5176                                                         mdt_hsm_request),
5177 TGT_MDT_HDL(HABEO_CLAVIS | HABEO_CORPUS | HABEO_REFERO | MUTABOR,
5178             MDS_SWAP_LAYOUTS,
5179             mdt_swap_layouts),
5180 TGT_MDT_HDL(0,          MDS_RMFID,      mdt_rmfid),
5181 };
5182
5183 static struct tgt_handler mdt_io_ops[] = {
5184 TGT_OST_HDL_HP(HABEO_CORPUS | HABEO_REFERO, OST_BRW_READ, tgt_brw_read,
5185                                                         mdt_hp_brw),
5186 TGT_OST_HDL_HP(HABEO_CORPUS | MUTABOR,   OST_BRW_WRITE, tgt_brw_write,
5187                                                         mdt_hp_brw),
5188 TGT_OST_HDL_HP(HABEO_CORPUS | HABEO_REFERO | MUTABOR,
5189                                          OST_PUNCH,     mdt_punch_hdl,
5190                                                         mdt_hp_punch),
5191 TGT_OST_HDL(HABEO_CORPUS | HABEO_REFERO, OST_SYNC,      mdt_data_sync),
5192 };
5193
5194 static struct tgt_handler mdt_sec_ctx_ops[] = {
5195 TGT_SEC_HDL_VAR(0,                      SEC_CTX_INIT,     mdt_sec_ctx_handle),
5196 TGT_SEC_HDL_VAR(0,                      SEC_CTX_INIT_CONT,mdt_sec_ctx_handle),
5197 TGT_SEC_HDL_VAR(0,                      SEC_CTX_FINI,     mdt_sec_ctx_handle)
5198 };
5199
5200 static struct tgt_handler mdt_quota_ops[] = {
5201 TGT_QUOTA_HDL(HABEO_REFERO,             QUOTA_DQACQ,      mdt_quota_dqacq),
5202 };
5203
5204 static struct tgt_opc_slice mdt_common_slice[] = {
5205         {
5206                 .tos_opc_start  = MDS_FIRST_OPC,
5207                 .tos_opc_end    = MDS_LAST_OPC,
5208                 .tos_hs         = mdt_tgt_handlers
5209         },
5210         {
5211                 .tos_opc_start  = OBD_FIRST_OPC,
5212                 .tos_opc_end    = OBD_LAST_OPC,
5213                 .tos_hs         = tgt_obd_handlers
5214         },
5215         {
5216                 .tos_opc_start  = LDLM_FIRST_OPC,
5217                 .tos_opc_end    = LDLM_LAST_OPC,
5218                 .tos_hs         = tgt_dlm_handlers
5219         },
5220         {
5221                 .tos_opc_start  = SEC_FIRST_OPC,
5222                 .tos_opc_end    = SEC_LAST_OPC,
5223                 .tos_hs         = mdt_sec_ctx_ops
5224         },
5225         {
5226                 .tos_opc_start  = OUT_UPDATE_FIRST_OPC,
5227                 .tos_opc_end    = OUT_UPDATE_LAST_OPC,
5228                 .tos_hs         = tgt_out_handlers
5229         },
5230         {
5231                 .tos_opc_start  = FLD_FIRST_OPC,
5232                 .tos_opc_end    = FLD_LAST_OPC,
5233                 .tos_hs         = fld_handlers
5234         },
5235         {
5236                 .tos_opc_start  = SEQ_FIRST_OPC,
5237                 .tos_opc_end    = SEQ_LAST_OPC,
5238                 .tos_hs         = seq_handlers
5239         },
5240         {
5241                 .tos_opc_start  = QUOTA_DQACQ,
5242                 .tos_opc_end    = QUOTA_LAST_OPC,
5243                 .tos_hs         = mdt_quota_ops
5244         },
5245         {
5246                 .tos_opc_start  = LLOG_FIRST_OPC,
5247                 .tos_opc_end    = LLOG_LAST_OPC,
5248                 .tos_hs         = tgt_llog_handlers
5249         },
5250         {
5251                 .tos_opc_start  = LFSCK_FIRST_OPC,
5252                 .tos_opc_end    = LFSCK_LAST_OPC,
5253                 .tos_hs         = tgt_lfsck_handlers
5254         },
5255         {
5256                 .tos_opc_start  = OST_FIRST_OPC,
5257                 .tos_opc_end    = OST_LAST_OPC,
5258                 .tos_hs         = mdt_io_ops
5259         },
5260         {
5261                 .tos_hs         = NULL
5262         }
5263 };
5264
5265 static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
5266 {
5267         struct md_device *next = m->mdt_child;
5268         struct lu_device *d = &m->mdt_lu_dev;
5269         struct obd_device *obd = mdt2obd_dev(m);
5270         struct lfsck_stop stop;
5271
5272         ENTRY;
5273         stop.ls_status = LS_PAUSED;
5274         stop.ls_flags = 0;
5275         next->md_ops->mdo_iocontrol(env, next, OBD_IOC_STOP_LFSCK, 0, &stop);
5276
5277         mdt_stack_pre_fini(env, m, md2lu_dev(m->mdt_child));
5278         ping_evictor_stop();
5279
5280         /* Remove the HSM /proc entry so the coordinator cannot be
5281          * restarted by a user while it's shutting down. */
5282         hsm_cdt_procfs_fini(m);
5283         mdt_hsm_cdt_stop(m);
5284
5285         mdt_llog_ctxt_unclone(env, m, LLOG_AGENT_ORIG_CTXT);
5286         mdt_llog_ctxt_unclone(env, m, LLOG_CHANGELOG_ORIG_CTXT);
5287
5288         if (m->mdt_namespace != NULL)
5289                 ldlm_namespace_free_prior(m->mdt_namespace, NULL,
5290                                           d->ld_obd->obd_force);
5291
5292         obd_exports_barrier(obd);
5293         obd_zombie_barrier();
5294
5295         mdt_quota_fini(env, m);
5296
5297         cfs_free_nidlist(&m->mdt_squash.rsi_nosquash_nids);
5298
5299         /* Calling the cleanup functions in the same order as in the mdt_init0
5300          * error path
5301          */
5302         mdt_procfs_fini(m);
5303
5304         target_recovery_fini(obd);
5305         upcall_cache_cleanup(m->mdt_identity_cache);
5306         m->mdt_identity_cache = NULL;
5307
5308         mdt_fs_cleanup(env, m);
5309
5310         tgt_fini(env, &m->mdt_lut);
5311
5312         mdt_hsm_cdt_fini(m);
5313
5314         if (m->mdt_los != NULL) {
5315                 local_oid_storage_fini(env, m->mdt_los);
5316                 m->mdt_los = NULL;
5317         }
5318
5319         if (m->mdt_namespace != NULL) {
5320                 ldlm_namespace_free_post(m->mdt_namespace);
5321                 d->ld_obd->obd_namespace = m->mdt_namespace = NULL;
5322         }
5323
5324         if (m->mdt_md_root != NULL) {
5325                 mdt_object_put(env, m->mdt_md_root);
5326                 m->mdt_md_root = NULL;
5327         }
5328
5329         mdt_seq_fini(env, m);
5330
5331         mdt_fld_fini(env, m);
5332
5333         /*
5334          * Finish the stack
5335          */
5336         mdt_stack_fini(env, m, md2lu_dev(m->mdt_child));
5337
5338         LASSERT(atomic_read(&d->ld_ref) == 0);
5339
5340         server_put_mount(mdt_obd_name(m), true);
5341
5342         EXIT;
5343 }
5344
5345 static int mdt_postrecov(const struct lu_env *, struct mdt_device *);
5346
5347 static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
5348                      struct lu_device_type *ldt, struct lustre_cfg *cfg)
5349 {
5350         const struct dt_device_param *dt_conf;
5351         struct mdt_thread_info *info;
5352         struct obd_device *obd;
5353         const char *dev = lustre_cfg_string(cfg, 0);
5354         const char *num = lustre_cfg_string(cfg, 2);
5355         struct tg_grants_data *tgd = &m->mdt_lut.lut_tgd;
5356         struct lustre_mount_info *lmi = NULL;
5357         struct lustre_sb_info *lsi;
5358         struct lu_site *s;
5359         struct seq_server_site *ss_site;
5360         const char *identity_upcall = "NONE";
5361         struct md_device *next;
5362         struct lu_fid fid;
5363         int rc;
5364         long node_id;
5365         mntopt_t mntopts;
5366         ENTRY;
5367
5368         lu_device_init(&m->mdt_lu_dev, ldt);
5369         /*
5370          * Environment (env) might be missing mdt_thread_key values at that
5371          * point, if device is allocated when mdt_thread_key is in QUIESCENT
5372          * mode.
5373          *
5374          * Usually device allocation path doesn't use module key values, but
5375          * mdt has to do a lot of work here, so allocate key value.
5376          */
5377         rc = lu_env_refill((struct lu_env *)env);
5378         if (rc != 0)
5379                 RETURN(rc);
5380
5381         info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
5382         LASSERT(info != NULL);
5383
5384         obd = class_name2obd(dev);
5385         LASSERT(obd != NULL);
5386
5387         m->mdt_max_mdsize = MAX_MD_SIZE_OLD;
5388         m->mdt_opts.mo_evict_tgt_nids = 1;
5389         m->mdt_opts.mo_cos = MDT_COS_DEFAULT;
5390
5391         lmi = server_get_mount(dev);
5392         if (lmi == NULL) {
5393                 CERROR("Cannot get mount info for %s!\n", dev);
5394                 RETURN(-EFAULT);
5395         } else {
5396                 lsi = s2lsi(lmi->lmi_sb);
5397                 /* CMD is supported only in IAM mode */
5398                 LASSERT(num);
5399                 node_id = simple_strtol(num, NULL, 10);
5400                 obd->u.obt.obt_magic = OBT_MAGIC;
5401                 if (lsi->lsi_lmd != NULL &&
5402                     lsi->lsi_lmd->lmd_flags & LMD_FLG_SKIP_LFSCK)
5403                         m->mdt_skip_lfsck = 1;
5404         }
5405
5406         /* DoM files get IO lock at open by default */
5407         m->mdt_opts.mo_dom_lock = ALWAYS_DOM_LOCK_ON_OPEN;
5408         /* DoM files are read at open and data is packed in the reply */
5409         m->mdt_opts.mo_dom_read_open = 1;
5410
5411         m->mdt_squash.rsi_uid = 0;
5412         m->mdt_squash.rsi_gid = 0;
5413         INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids);
5414         init_rwsem(&m->mdt_squash.rsi_sem);
5415         spin_lock_init(&m->mdt_lock);
5416         m->mdt_enable_remote_dir = 1;
5417         m->mdt_enable_striped_dir = 1;
5418         m->mdt_enable_dir_migration = 1;
5419         m->mdt_enable_remote_dir_gid = 0;
5420         m->mdt_enable_chprojid_gid = 0;
5421         m->mdt_enable_remote_rename = 1;
5422         m->mdt_enable_remote_subdir_mount = 1;
5423
5424         atomic_set(&m->mdt_mds_mds_conns, 0);
5425         atomic_set(&m->mdt_async_commit_count, 0);
5426
5427         m->mdt_lu_dev.ld_ops = &mdt_lu_ops;
5428         m->mdt_lu_dev.ld_obd = obd;
5429         /* Set this lu_device to obd for error handling purposes. */
5430         obd->obd_lu_dev = &m->mdt_lu_dev;
5431
5432         /* init the stack */
5433         rc = mdt_stack_init((struct lu_env *)env, m, cfg);
5434         if (rc) {
5435                 CERROR("%s: Can't init device stack, rc %d\n",
5436                        mdt_obd_name(m), rc);
5437                 GOTO(err_lmi, rc);
5438         }
5439
5440         s = mdt_lu_site(m);
5441         ss_site = mdt_seq_site(m);
5442         s->ld_seq_site = ss_site;
5443         ss_site->ss_lu = s;
5444
5445         /* set server index */
5446         ss_site->ss_node_id = node_id;
5447
5448         /* failover is the default
5449          * FIXME: we do not failout mds0/mgs, which may cause some problems.
5450          * assumed whose ss_node_id == 0 XXX
5451          * */
5452         obd->obd_replayable = 1;
5453         /* No connection accepted until configurations will finish */
5454         obd->obd_no_conn = 1;
5455
5456         if (cfg->lcfg_bufcount > 4 && LUSTRE_CFG_BUFLEN(cfg, 4) > 0) {
5457                 char *str = lustre_cfg_string(cfg, 4);
5458                 if (strchr(str, 'n')) {
5459                         CWARN("%s: recovery disabled\n", mdt_obd_name(m));
5460                         obd->obd_replayable = 0;
5461                 }
5462         }
5463
5464         rc = mdt_fld_init(env, mdt_obd_name(m), m);
5465         if (rc)
5466                 GOTO(err_fini_stack, rc);
5467
5468         rc = mdt_seq_init(env, m);
5469         if (rc)
5470                 GOTO(err_fini_fld, rc);
5471
5472         snprintf(info->mti_u.ns_name, sizeof(info->mti_u.ns_name), "%s-%s",
5473                  LUSTRE_MDT_NAME, obd->obd_uuid.uuid);
5474         m->mdt_namespace = ldlm_namespace_new(obd, info->mti_u.ns_name,
5475                                               LDLM_NAMESPACE_SERVER,
5476                                               LDLM_NAMESPACE_GREEDY,
5477                                               LDLM_NS_TYPE_MDT);
5478         if (m->mdt_namespace == NULL)
5479                 GOTO(err_fini_seq, rc = -ENOMEM);
5480
5481         m->mdt_namespace->ns_lvbp = m;
5482         m->mdt_namespace->ns_lvbo = &mdt_lvbo;
5483
5484         ldlm_register_intent(m->mdt_namespace, mdt_intent_policy);
5485         /* set obd_namespace for compatibility with old code */
5486         obd->obd_namespace = m->mdt_namespace;
5487
5488         rc = tgt_init(env, &m->mdt_lut, obd, m->mdt_bottom, mdt_common_slice,
5489                       OBD_FAIL_MDS_ALL_REQUEST_NET,
5490                       OBD_FAIL_MDS_ALL_REPLY_NET);
5491         if (rc)
5492                 GOTO(err_free_ns, rc);
5493
5494         /* Amount of available space excluded from granting and reserved
5495          * for metadata. It is in percentage and 50% is default value. */
5496         tgd->tgd_reserved_pcnt = 50;
5497
5498         if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits))
5499                 m->mdt_brw_size = 1U << tgd->tgd_blockbits;
5500         else
5501                 m->mdt_brw_size = ONE_MB_BRW_SIZE;
5502
5503         rc = mdt_fs_setup(env, m, obd, lsi);
5504         if (rc)
5505                 GOTO(err_tgt, rc);
5506
5507         fid.f_seq = FID_SEQ_LOCAL_NAME;
5508         fid.f_oid = 1;
5509         fid.f_ver = 0;
5510         rc = local_oid_storage_init(env, m->mdt_bottom, &fid, &m->mdt_los);
5511         if (rc != 0)
5512                 GOTO(err_fs_cleanup, rc);
5513
5514         rc = mdt_hsm_cdt_init(m);
5515         if (rc != 0) {
5516                 CERROR("%s: error initializing coordinator, rc %d\n",
5517                        mdt_obd_name(m), rc);
5518                 GOTO(err_los_fini, rc);
5519         }
5520
5521         tgt_adapt_sptlrpc_conf(&m->mdt_lut);
5522
5523         next = m->mdt_child;
5524         dt_conf = next->md_ops->mdo_dtconf_get(env, next);
5525
5526         mntopts = dt_conf->ddp_mntopts;
5527
5528         if (mntopts & MNTOPT_USERXATTR)
5529                 m->mdt_opts.mo_user_xattr = 1;
5530         else
5531                 m->mdt_opts.mo_user_xattr = 0;
5532
5533         m->mdt_max_ea_size = dt_conf->ddp_max_ea_size;
5534
5535         if (mntopts & MNTOPT_ACL)
5536                 m->mdt_opts.mo_acl = 1;
5537         else
5538                 m->mdt_opts.mo_acl = 0;
5539
5540         /* XXX: to support suppgid for ACL, we enable identity_upcall
5541          * by default, otherwise, maybe got unexpected -EACCESS. */
5542         if (m->mdt_opts.mo_acl)
5543                 identity_upcall = MDT_IDENTITY_UPCALL_PATH;
5544
5545         m->mdt_identity_cache = upcall_cache_init(mdt_obd_name(m),
5546                                                 identity_upcall,
5547                                                 &mdt_identity_upcall_cache_ops);
5548         if (IS_ERR(m->mdt_identity_cache)) {
5549                 rc = PTR_ERR(m->mdt_identity_cache);
5550                 m->mdt_identity_cache = NULL;
5551                 GOTO(err_free_hsm, rc);
5552         }
5553
5554         rc = mdt_procfs_init(m, dev);
5555         if (rc) {
5556                 CERROR("Can't init MDT lprocfs, rc %d\n", rc);
5557                 GOTO(err_recovery, rc);
5558         }
5559
5560         rc = mdt_quota_init(env, m, cfg);
5561         if (rc)
5562                 GOTO(err_procfs, rc);
5563
5564         m->mdt_ldlm_client = &mdt2obd_dev(m)->obd_ldlm_client;
5565         ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
5566                            "mdt_ldlm_client", m->mdt_ldlm_client);
5567
5568         ping_evictor_start();
5569
5570         /* recovery will be started upon mdt_prepare()
5571          * when the whole stack is complete and ready
5572          * to serve the requests */
5573
5574         /* Reduce the initial timeout on an MDS because it doesn't need such
5575          * a long timeout as an OST does. Adaptive timeouts will adjust this
5576          * value appropriately. */
5577         if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT)
5578                 ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT;
5579
5580         RETURN(0);
5581 err_procfs:
5582         mdt_procfs_fini(m);
5583 err_recovery:
5584         target_recovery_fini(obd);
5585         upcall_cache_cleanup(m->mdt_identity_cache);
5586         m->mdt_identity_cache = NULL;
5587 err_free_hsm:
5588         mdt_hsm_cdt_fini(m);
5589 err_los_fini:
5590         local_oid_storage_fini(env, m->mdt_los);
5591         m->mdt_los = NULL;
5592 err_fs_cleanup:
5593         mdt_fs_cleanup(env, m);
5594 err_tgt:
5595         tgt_fini(env, &m->mdt_lut);
5596 err_free_ns:
5597         ldlm_namespace_free(m->mdt_namespace, NULL, 0);
5598         obd->obd_namespace = m->mdt_namespace = NULL;
5599 err_fini_seq:
5600         mdt_seq_fini(env, m);
5601 err_fini_fld:
5602         mdt_fld_fini(env, m);
5603 err_fini_stack:
5604         mdt_stack_fini(env, m, md2lu_dev(m->mdt_child));
5605 err_lmi:
5606         if (lmi)
5607                 server_put_mount(dev, true);
5608         return(rc);
5609 }
5610
5611 /* For interoperability, the left element is old parameter, the right one
5612  * is the new version of the parameter, if some parameter is deprecated,
5613  * the new version should be set as NULL. */
5614 static struct cfg_interop_param mdt_interop_param[] = {
5615         { "mdt.group_upcall",   NULL },
5616         { "mdt.quota_type",     NULL },
5617         { "mdd.quota_type",     NULL },
5618         { "mdt.som",            NULL },
5619         { "mdt.rootsquash",     "mdt.root_squash" },
5620         { "mdt.nosquash_nid",   "mdt.nosquash_nids" },
5621         { NULL }
5622 };
5623
5624 /* used by MGS to process specific configurations */
5625 static int mdt_process_config(const struct lu_env *env,
5626                               struct lu_device *d, struct lustre_cfg *cfg)
5627 {
5628         struct mdt_device *m = mdt_dev(d);
5629         struct md_device *md_next = m->mdt_child;
5630         struct lu_device *next = md2lu_dev(md_next);
5631         int rc;
5632         ENTRY;
5633
5634         switch (cfg->lcfg_command) {
5635         case LCFG_PARAM: {
5636                 struct obd_device          *obd = d->ld_obd;
5637
5638                 /* For interoperability */
5639                 struct cfg_interop_param   *ptr = NULL;
5640                 struct lustre_cfg          *old_cfg = NULL;
5641                 char                       *param = NULL;
5642
5643                 param = lustre_cfg_string(cfg, 1);
5644                 if (param == NULL) {
5645                         CERROR("param is empty\n");
5646                         rc = -EINVAL;
5647                         break;
5648                 }
5649
5650                 ptr = class_find_old_param(param, mdt_interop_param);
5651                 if (ptr != NULL) {
5652                         if (ptr->new_param == NULL) {
5653                                 rc = 0;
5654                                 CWARN("For interoperability, skip this %s."
5655                                       " It is obsolete.\n", ptr->old_param);
5656                                 break;
5657                         }
5658
5659                         CWARN("Found old param %s, changed it to %s.\n",
5660                               ptr->old_param, ptr->new_param);
5661
5662                         old_cfg = cfg;
5663                         cfg = lustre_cfg_rename(old_cfg, ptr->new_param);
5664                         if (IS_ERR(cfg)) {
5665                                 rc = PTR_ERR(cfg);
5666                                 break;
5667                         }
5668                 }
5669
5670                 rc = class_process_proc_param(PARAM_MDT, obd->obd_vars,
5671                                               cfg, obd);
5672                 if (rc > 0 || rc == -ENOSYS) {
5673                         /* is it an HSM var ? */
5674                         rc = class_process_proc_param(PARAM_HSM,
5675                                                       hsm_cdt_get_proc_vars(),
5676                                                       cfg, obd);
5677                         if (rc > 0 || rc == -ENOSYS)
5678                                 /* we don't understand; pass it on */
5679                                 rc = next->ld_ops->ldo_process_config(env, next,
5680                                                                       cfg);
5681                 }
5682
5683                 if (old_cfg)
5684                         OBD_FREE(cfg, lustre_cfg_len(cfg->lcfg_bufcount,
5685                                                      cfg->lcfg_buflens));
5686                 break;
5687         }
5688         default:
5689                 /* others are passed further */
5690                 rc = next->ld_ops->ldo_process_config(env, next, cfg);
5691                 break;
5692         }
5693         RETURN(rc);
5694 }
5695
5696 static struct lu_object *mdt_object_alloc(const struct lu_env *env,
5697                                           const struct lu_object_header *hdr,
5698                                           struct lu_device *d)
5699 {
5700         struct mdt_object *mo;
5701
5702         ENTRY;
5703
5704         OBD_SLAB_ALLOC_PTR_GFP(mo, mdt_object_kmem, GFP_NOFS);
5705         if (mo != NULL) {
5706                 struct lu_object *o;
5707                 struct lu_object_header *h;
5708
5709                 o = &mo->mot_obj;
5710                 h = &mo->mot_header;
5711                 lu_object_header_init(h);
5712                 lu_object_init(o, h, d);
5713                 lu_object_add_top(h, o);
5714                 o->lo_ops = &mdt_obj_ops;
5715                 spin_lock_init(&mo->mot_write_lock);
5716                 mutex_init(&mo->mot_som_mutex);
5717                 mutex_init(&mo->mot_lov_mutex);
5718                 init_rwsem(&mo->mot_dom_sem);
5719                 init_rwsem(&mo->mot_open_sem);
5720                 atomic_set(&mo->mot_open_count, 0);
5721                 RETURN(o);
5722         }
5723         RETURN(NULL);
5724 }
5725
5726 static int mdt_object_init(const struct lu_env *env, struct lu_object *o,
5727                            const struct lu_object_conf *unused)
5728 {
5729         struct mdt_device *d = mdt_dev(o->lo_dev);
5730         struct lu_device  *under;
5731         struct lu_object  *below;
5732         int                rc = 0;
5733         ENTRY;
5734
5735         CDEBUG(D_INFO, "object init, fid = "DFID"\n",
5736                PFID(lu_object_fid(o)));
5737
5738         under = &d->mdt_child->md_lu_dev;
5739         below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under);
5740         if (below != NULL) {
5741                 lu_object_add(o, below);
5742         } else
5743                 rc = -ENOMEM;
5744
5745         RETURN(rc);
5746 }
5747
5748 static void mdt_object_free(const struct lu_env *env, struct lu_object *o)
5749 {
5750         struct mdt_object *mo = mdt_obj(o);
5751         struct lu_object_header *h;
5752         ENTRY;
5753
5754         h = o->lo_header;
5755         CDEBUG(D_INFO, "object free, fid = "DFID"\n",
5756                PFID(lu_object_fid(o)));
5757
5758         LASSERT(atomic_read(&mo->mot_open_count) == 0);
5759         LASSERT(atomic_read(&mo->mot_lease_count) == 0);
5760
5761         lu_object_fini(o);
5762         lu_object_header_fini(h);
5763         OBD_SLAB_FREE_PTR(mo, mdt_object_kmem);
5764
5765         EXIT;
5766 }
5767
5768 static int mdt_object_print(const struct lu_env *env, void *cookie,
5769                             lu_printer_t p, const struct lu_object *o)
5770 {
5771         struct mdt_object *mdto = mdt_obj((struct lu_object *)o);
5772
5773         return (*p)(env, cookie,
5774                     LUSTRE_MDT_NAME"-object@%p(%s %s, writecount=%d)",
5775                     mdto, mdto->mot_lov_created ? "lov_created" : "",
5776                     mdto->mot_cache_attr ? "cache_attr" : "",
5777                     mdto->mot_write_count);
5778 }
5779
5780 static int mdt_prepare(const struct lu_env *env,
5781                 struct lu_device *pdev,
5782                 struct lu_device *cdev)
5783 {
5784         struct mdt_device *mdt = mdt_dev(cdev);
5785         struct lu_device *next = &mdt->mdt_child->md_lu_dev;
5786         struct obd_device *obd = cdev->ld_obd;
5787         int rc;
5788
5789         ENTRY;
5790
5791         LASSERT(obd);
5792
5793         rc = next->ld_ops->ldo_prepare(env, cdev, next);
5794         if (rc)
5795                 RETURN(rc);
5796
5797         rc = mdt_llog_ctxt_clone(env, mdt, LLOG_CHANGELOG_ORIG_CTXT);
5798         if (rc)
5799                 RETURN(rc);
5800
5801         rc = mdt_llog_ctxt_clone(env, mdt, LLOG_AGENT_ORIG_CTXT);
5802         if (rc)
5803                 RETURN(rc);
5804
5805         rc = lfsck_register_namespace(env, mdt->mdt_bottom, mdt->mdt_namespace);
5806         /* The LFSCK instance is registered just now, so it must be there when
5807          * register the namespace to such instance. */
5808         LASSERTF(rc == 0, "register namespace failed: rc = %d\n", rc);
5809
5810         if (mdt->mdt_seq_site.ss_node_id == 0) {
5811                 rc = mdt->mdt_child->md_ops->mdo_root_get(env, mdt->mdt_child,
5812                                                          &mdt->mdt_md_root_fid);
5813                 if (rc)
5814                         RETURN(rc);
5815         }
5816
5817         LASSERT(!test_bit(MDT_FL_CFGLOG, &mdt->mdt_state));
5818
5819         target_recovery_init(&mdt->mdt_lut, tgt_request_handle);
5820         set_bit(MDT_FL_CFGLOG, &mdt->mdt_state);
5821         LASSERT(obd->obd_no_conn);
5822         spin_lock(&obd->obd_dev_lock);
5823         obd->obd_no_conn = 0;
5824         spin_unlock(&obd->obd_dev_lock);
5825
5826         if (obd->obd_recovering == 0)
5827                 mdt_postrecov(env, mdt);
5828
5829         RETURN(rc);
5830 }
5831
5832 const struct lu_device_operations mdt_lu_ops = {
5833         .ldo_object_alloc   = mdt_object_alloc,
5834         .ldo_process_config = mdt_process_config,
5835         .ldo_prepare        = mdt_prepare,
5836 };
5837
5838 static const struct lu_object_operations mdt_obj_ops = {
5839         .loo_object_init    = mdt_object_init,
5840         .loo_object_free    = mdt_object_free,
5841         .loo_object_print   = mdt_object_print
5842 };
5843
5844 static int mdt_obd_set_info_async(const struct lu_env *env,
5845                                   struct obd_export *exp,
5846                                   __u32 keylen, void *key,
5847                                   __u32 vallen, void *val,
5848                                   struct ptlrpc_request_set *set)
5849 {
5850         int rc;
5851
5852         ENTRY;
5853
5854         if (KEY_IS(KEY_SPTLRPC_CONF)) {
5855                 rc = tgt_adapt_sptlrpc_conf(class_exp2tgt(exp));
5856                 RETURN(rc);
5857         }
5858
5859         RETURN(0);
5860 }
5861
5862 /**
5863  * Match client and server connection feature flags.
5864  *
5865  * Compute the compatibility flags for a connection request based on
5866  * features mutually supported by client and server.
5867  *
5868  * The obd_export::exp_connect_data.ocd_connect_flags field in \a exp
5869  * must not be updated here, otherwise a partially initialized value may
5870  * be exposed. After the connection request is successfully processed,
5871  * the top-level MDT connect request handler atomically updates the export
5872  * connect flags from the obd_connect_data::ocd_connect_flags field of the
5873  * reply. \see mdt_connect().
5874  *
5875  * Before 2.7.50 clients will send a struct obd_connect_data_v1 rather than a
5876  * full struct obd_connect_data. So care must be taken when accessing fields
5877  * that are not present in struct obd_connect_data_v1. See LU-16.
5878  *
5879  * \param exp   the obd_export associated with this client/target pair
5880  * \param mdt   the target device for the connection
5881  * \param data  stores data for this connect request
5882  *
5883  * \retval 0       success
5884  * \retval -EPROTO \a data unexpectedly has zero obd_connect_data::ocd_brw_size
5885  * \retval -EBADE  client and server feature requirements are incompatible
5886  */
5887 static int mdt_connect_internal(const struct lu_env *env,
5888                                 struct obd_export *exp,
5889                                 struct mdt_device *mdt,
5890                                 struct obd_connect_data *data, bool reconnect)
5891 {
5892         const char *obd_name = mdt_obd_name(mdt);
5893         LASSERT(data != NULL);
5894
5895         data->ocd_connect_flags &= MDT_CONNECT_SUPPORTED;
5896
5897         if (mdt->mdt_bottom->dd_rdonly &&
5898             !(data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) &&
5899             !(data->ocd_connect_flags & OBD_CONNECT_RDONLY))
5900                 RETURN(-EACCES);
5901
5902         if (data->ocd_connect_flags & OBD_CONNECT_FLAGS2)
5903                 data->ocd_connect_flags2 &= MDT_CONNECT_SUPPORTED2;
5904
5905         data->ocd_ibits_known &= MDS_INODELOCK_FULL;
5906
5907         if (!mdt->mdt_opts.mo_acl)
5908                 data->ocd_connect_flags &= ~OBD_CONNECT_ACL;
5909
5910         if (!mdt->mdt_opts.mo_user_xattr)
5911                 data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
5912
5913         if (OCD_HAS_FLAG(data, BRW_SIZE)) {
5914                 data->ocd_brw_size = min(data->ocd_brw_size,
5915                                          mdt->mdt_brw_size);
5916                 if (data->ocd_brw_size == 0) {
5917                         CERROR("%s: cli %s/%p ocd_connect_flags: %#llx "
5918                                "ocd_version: %x ocd_grant: %d ocd_index: %u "
5919                                "ocd_brw_size unexpectedly zero, network data "
5920                                "corruption? Refusing to connect this client\n",
5921                                obd_name, exp->exp_client_uuid.uuid,
5922                                exp, data->ocd_connect_flags, data->ocd_version,
5923                                data->ocd_grant, data->ocd_index);
5924                         return -EPROTO;
5925                 }
5926         }
5927
5928         if (OCD_HAS_FLAG(data, GRANT_PARAM)) {
5929                 struct dt_device_param *ddp = &mdt->mdt_lut.lut_dt_conf;
5930
5931                 /* client is reporting its page size, for future use */
5932                 exp->exp_target_data.ted_pagebits = data->ocd_grant_blkbits;
5933                 data->ocd_grant_blkbits  = mdt->mdt_lut.lut_tgd.tgd_blockbits;
5934                 /* ddp_inodespace may not be power-of-two value, eg. for ldiskfs
5935                  * it's LDISKFS_DIR_REC_LEN(20) = 28. */
5936                 data->ocd_grant_inobits = fls(ddp->ddp_inodespace - 1);
5937                 /* ocd_grant_tax_kb is in 1K byte blocks */
5938                 data->ocd_grant_tax_kb = ddp->ddp_extent_tax >> 10;
5939                 data->ocd_grant_max_blks = ddp->ddp_max_extent_blks;
5940         }
5941
5942         /* Save connect_data we have so far because tgt_grant_connect()
5943          * uses it to calculate grant, and we want to save the client
5944          * version before it is overwritten by LUSTRE_VERSION_CODE. */
5945         exp->exp_connect_data = *data;
5946         if (OCD_HAS_FLAG(data, GRANT))
5947                 tgt_grant_connect(env, exp, data, !reconnect);
5948
5949         if (OCD_HAS_FLAG(data, MAXBYTES))
5950                 data->ocd_maxbytes = mdt->mdt_lut.lut_dt_conf.ddp_maxbytes;
5951
5952         /* NB: Disregard the rule against updating
5953          * exp_connect_data.ocd_connect_flags in this case, since
5954          * tgt_client_new() needs to know if this is a lightweight
5955          * connection, and it is safe to expose this flag before
5956          * connection processing completes. */
5957         if (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) {
5958                 spin_lock(&exp->exp_lock);
5959                 *exp_connect_flags_ptr(exp) |= OBD_CONNECT_LIGHTWEIGHT;
5960                 spin_unlock(&exp->exp_lock);
5961         }
5962
5963         data->ocd_version = LUSTRE_VERSION_CODE;
5964
5965         if ((data->ocd_connect_flags & OBD_CONNECT_FID) == 0) {
5966                 CWARN("%s: MDS requires FID support, but client not\n",
5967                       obd_name);
5968                 return -EBADE;
5969         }
5970
5971         if (OCD_HAS_FLAG(data, PINGLESS)) {
5972                 if (ptlrpc_pinger_suppress_pings()) {
5973                         spin_lock(&exp->exp_obd->obd_dev_lock);
5974                         list_del_init(&exp->exp_obd_chain_timed);
5975                         spin_unlock(&exp->exp_obd->obd_dev_lock);
5976                 } else {
5977                         data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS;
5978                 }
5979         }
5980
5981         data->ocd_max_easize = mdt->mdt_max_ea_size;
5982
5983         /* NB: Disregard the rule against updating
5984          * exp_connect_data.ocd_connect_flags in this case, since
5985          * tgt_client_new() needs to know if this is client supports
5986          * multiple modify RPCs, and it is safe to expose this flag before
5987          * connection processing completes. */
5988         if (data->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) {
5989                 data->ocd_maxmodrpcs = max_mod_rpcs_per_client;
5990                 spin_lock(&exp->exp_lock);
5991                 *exp_connect_flags_ptr(exp) |= OBD_CONNECT_MULTIMODRPCS;
5992                 spin_unlock(&exp->exp_lock);
5993         }
5994
5995         if (OCD_HAS_FLAG(data, CKSUM)) {
5996                 __u32 cksum_types = data->ocd_cksum_types;
5997
5998                 /* The client set in ocd_cksum_types the checksum types it
5999                  * supports. We have to mask off the algorithms that we don't
6000                  * support */
6001                 data->ocd_cksum_types &=
6002                         obd_cksum_types_supported_server(obd_name);
6003
6004                 if (unlikely(data->ocd_cksum_types == 0)) {
6005                         CERROR("%s: Connect with checksum support but no "
6006                                "ocd_cksum_types is set\n",
6007                                exp->exp_obd->obd_name);
6008                         RETURN(-EPROTO);
6009                 }
6010
6011                 CDEBUG(D_RPCTRACE, "%s: cli %s supports cksum type %x, return "
6012                        "%x\n", exp->exp_obd->obd_name, obd_export_nid2str(exp),
6013                        cksum_types, data->ocd_cksum_types);
6014         } else {
6015                 /* This client does not support OBD_CONNECT_CKSUM
6016                  * fall back to CRC32 */
6017                 CDEBUG(D_RPCTRACE, "%s: cli %s does not support "
6018                        "OBD_CONNECT_CKSUM, CRC32 will be used\n",
6019                        exp->exp_obd->obd_name, obd_export_nid2str(exp));
6020         }
6021
6022         return 0;
6023 }
6024
6025 static int mdt_ctxt_add_dirty_flag(struct lu_env *env,
6026                                    struct mdt_thread_info *info,
6027                                    struct mdt_file_data *mfd)
6028 {
6029         struct lu_context ses;
6030         int rc;
6031         ENTRY;
6032
6033         rc = lu_context_init(&ses, LCT_SERVER_SESSION);
6034         if (rc)
6035                 RETURN(rc);
6036
6037         env->le_ses = &ses;
6038         lu_context_enter(&ses);
6039
6040         mdt_ucred(info)->uc_valid = UCRED_OLD;
6041         rc = mdt_add_dirty_flag(info, mfd->mfd_object, &info->mti_attr);
6042
6043         lu_context_exit(&ses);
6044         lu_context_fini(&ses);
6045         env->le_ses = NULL;
6046
6047         RETURN(rc);
6048 }
6049
6050 static int mdt_export_cleanup(struct obd_export *exp)
6051 {
6052         struct list_head         closing_list;
6053         struct mdt_export_data  *med = &exp->exp_mdt_data;
6054         struct obd_device       *obd = exp->exp_obd;
6055         struct mdt_device       *mdt;
6056         struct mdt_thread_info  *info;
6057         struct lu_env            env;
6058         struct mdt_file_data    *mfd, *n;
6059         int rc = 0;
6060         ENTRY;
6061
6062         INIT_LIST_HEAD(&closing_list);
6063         spin_lock(&med->med_open_lock);
6064         while (!list_empty(&med->med_open_head)) {
6065                 struct list_head *tmp = med->med_open_head.next;
6066                 mfd = list_entry(tmp, struct mdt_file_data, mfd_list);
6067
6068                 /* Remove mfd handle so it can't be found again.
6069                  * We are consuming the mfd_list reference here. */
6070                 class_handle_unhash(&mfd->mfd_open_handle);
6071                 list_move_tail(&mfd->mfd_list, &closing_list);
6072         }
6073         spin_unlock(&med->med_open_lock);
6074         mdt = mdt_dev(obd->obd_lu_dev);
6075         LASSERT(mdt != NULL);
6076
6077         rc = lu_env_init(&env, LCT_MD_THREAD);
6078         if (rc)
6079                 RETURN(rc);
6080
6081         info = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
6082         LASSERT(info != NULL);
6083         memset(info, 0, sizeof *info);
6084         info->mti_env = &env;
6085         info->mti_mdt = mdt;
6086         info->mti_exp = exp;
6087
6088         if (!list_empty(&closing_list)) {
6089                 struct md_attr *ma = &info->mti_attr;
6090
6091                 /* Close any open files (which may also cause orphan
6092                  * unlinking). */
6093                 list_for_each_entry_safe(mfd, n, &closing_list, mfd_list) {
6094                         list_del_init(&mfd->mfd_list);
6095                         ma->ma_need = ma->ma_valid = 0;
6096
6097                         /* This file is being closed due to an eviction, it
6098                          * could have been modified and now dirty regarding to
6099                          * HSM archive, check this!
6100                          * The logic here is to mark a file dirty if there's a
6101                          * chance it was dirtied before the client was evicted,
6102                          * so that we don't have to wait for a release attempt
6103                          * before finding out the file was actually dirty and
6104                          * fail the release. Aggressively marking it dirty here
6105                          * will cause the policy engine to attempt to
6106                          * re-archive it; when rearchiving, we can compare the
6107                          * current version to the HSM data_version and make the
6108                          * archive request into a noop if it's not actually
6109                          * dirty.
6110                          */
6111                         if (mfd->mfd_open_flags & MDS_FMODE_WRITE)
6112                                 rc = mdt_ctxt_add_dirty_flag(&env, info, mfd);
6113
6114                         /* Don't unlink orphan on failover umount, LU-184 */
6115                         if (exp->exp_flags & OBD_OPT_FAILOVER) {
6116                                 ma->ma_valid = MA_FLAGS;
6117                                 ma->ma_attr_flags |= MDS_KEEP_ORPHAN;
6118                         }
6119                         mdt_mfd_close(info, mfd);
6120                 }
6121         }
6122         info->mti_mdt = NULL;
6123         /* cleanup client slot early */
6124         /* Do not erase record for recoverable client. */
6125         if (!(exp->exp_flags & OBD_OPT_FAILOVER) || exp->exp_failed)
6126                 tgt_client_del(&env, exp);
6127         lu_env_fini(&env);
6128
6129         RETURN(rc);
6130 }
6131
6132 static inline void mdt_enable_slc(struct mdt_device *mdt)
6133 {
6134         if (mdt->mdt_lut.lut_sync_lock_cancel == SYNC_LOCK_CANCEL_NEVER)
6135                 mdt->mdt_lut.lut_sync_lock_cancel = SYNC_LOCK_CANCEL_BLOCKING;
6136 }
6137
6138 static inline void mdt_disable_slc(struct mdt_device *mdt)
6139 {
6140         if (mdt->mdt_lut.lut_sync_lock_cancel == SYNC_LOCK_CANCEL_BLOCKING)
6141                 mdt->mdt_lut.lut_sync_lock_cancel = SYNC_LOCK_CANCEL_NEVER;
6142 }
6143
6144 static int mdt_obd_disconnect(struct obd_export *exp)
6145 {
6146         int rc;
6147
6148         ENTRY;
6149
6150         LASSERT(exp);
6151         class_export_get(exp);
6152
6153         if (!(exp->exp_flags & OBD_OPT_FORCE))
6154                 tgt_grant_sanity_check(exp->exp_obd, __func__);
6155
6156         if ((exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
6157             !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT)) {
6158                 struct mdt_device *mdt = mdt_dev(exp->exp_obd->obd_lu_dev);
6159
6160                 if (atomic_dec_and_test(&mdt->mdt_mds_mds_conns))
6161                         mdt_disable_slc(mdt);
6162         }
6163
6164         rc = server_disconnect_export(exp);
6165         if (rc != 0)
6166                 CDEBUG(D_IOCTL, "server disconnect error: rc = %d\n", rc);
6167
6168         tgt_grant_discard(exp);
6169
6170         rc = mdt_export_cleanup(exp);
6171         nodemap_del_member(exp);
6172         class_export_put(exp);
6173         RETURN(rc);
6174 }
6175
6176 /* mds_connect copy */
6177 static int mdt_obd_connect(const struct lu_env *env,
6178                            struct obd_export **exp, struct obd_device *obd,
6179                            struct obd_uuid *cluuid,
6180                            struct obd_connect_data *data,
6181                            void *localdata)
6182 {
6183         struct obd_export       *lexp;
6184         struct lustre_handle    conn = { 0 };
6185         struct mdt_device       *mdt;
6186         int                      rc;
6187         lnet_nid_t              *client_nid = localdata;
6188         ENTRY;
6189
6190         LASSERT(env != NULL);
6191         LASSERT(data != NULL);
6192
6193         if (!exp || !obd || !cluuid)
6194                 RETURN(-EINVAL);
6195
6196         mdt = mdt_dev(obd->obd_lu_dev);
6197
6198         if ((data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) &&
6199             !(data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT)) {
6200                 atomic_inc(&mdt->mdt_mds_mds_conns);
6201                 mdt_enable_slc(mdt);
6202         }
6203
6204         /*
6205          * first, check whether the stack is ready to handle requests
6206          * XXX: probably not very appropriate method is used now
6207          *      at some point we should find a better one
6208          */
6209         if (!test_bit(MDT_FL_SYNCED, &mdt->mdt_state) &&
6210             !(data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) &&
6211             !(data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
6212                 rc = obd_get_info(env, mdt->mdt_child_exp,
6213                                   sizeof(KEY_OSP_CONNECTED),
6214                                   KEY_OSP_CONNECTED, NULL, NULL);
6215                 if (rc)
6216                         RETURN(-EAGAIN);
6217                 set_bit(MDT_FL_SYNCED, &mdt->mdt_state);
6218         }
6219
6220         rc = class_connect(&conn, obd, cluuid);
6221         if (rc)
6222                 RETURN(rc);
6223
6224         lexp = class_conn2export(&conn);
6225         LASSERT(lexp != NULL);
6226
6227         rc = nodemap_add_member(*client_nid, lexp);
6228         if (rc != 0 && rc != -EEXIST)
6229                 GOTO(out, rc);
6230
6231         rc = mdt_connect_internal(env, lexp, mdt, data, false);
6232         if (rc == 0) {
6233                 struct lsd_client_data *lcd = lexp->exp_target_data.ted_lcd;
6234
6235                 LASSERT(lcd);
6236                 memcpy(lcd->lcd_uuid, cluuid, sizeof lcd->lcd_uuid);
6237                 rc = tgt_client_new(env, lexp);
6238                 if (rc == 0)
6239                         mdt_export_stats_init(obd, lexp, localdata);
6240         }
6241 out:
6242         if (rc != 0) {
6243                 class_disconnect(lexp);
6244                 nodemap_del_member(lexp);
6245                 *exp = NULL;
6246         } else {
6247                 *exp = lexp;
6248                 /* Because we do not want this export to be evicted by pinger,
6249                  * let's not add this export to the timed chain list. */
6250                 if (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) {
6251                         spin_lock(&lexp->exp_obd->obd_dev_lock);
6252                         list_del_init(&lexp->exp_obd_chain_timed);
6253                         spin_unlock(&lexp->exp_obd->obd_dev_lock);
6254                 }
6255         }
6256
6257         RETURN(rc);
6258 }
6259
6260 static int mdt_obd_reconnect(const struct lu_env *env,
6261                              struct obd_export *exp, struct obd_device *obd,
6262                              struct obd_uuid *cluuid,
6263                              struct obd_connect_data *data,
6264                              void *localdata)
6265 {
6266         lnet_nid_t             *client_nid = localdata;
6267         int                     rc;
6268         ENTRY;
6269
6270         if (exp == NULL || obd == NULL || cluuid == NULL)
6271                 RETURN(-EINVAL);
6272
6273         rc = nodemap_add_member(*client_nid, exp);
6274         if (rc != 0 && rc != -EEXIST)
6275                 RETURN(rc);
6276
6277         rc = mdt_connect_internal(env, exp, mdt_dev(obd->obd_lu_dev), data,
6278                                   true);
6279         if (rc == 0)
6280                 mdt_export_stats_init(obd, exp, localdata);
6281         else
6282                 nodemap_del_member(exp);
6283
6284         RETURN(rc);
6285 }
6286
6287 /* FIXME: Can we avoid using these two interfaces? */
6288 static int mdt_init_export(struct obd_export *exp)
6289 {
6290         struct mdt_export_data *med = &exp->exp_mdt_data;
6291         int                     rc;
6292         ENTRY;
6293
6294         INIT_LIST_HEAD(&med->med_open_head);
6295         spin_lock_init(&med->med_open_lock);
6296         spin_lock(&exp->exp_lock);
6297         exp->exp_connecting = 1;
6298         spin_unlock(&exp->exp_lock);
6299
6300         /* self-export doesn't need client data and ldlm initialization */
6301         if (unlikely(obd_uuid_equals(&exp->exp_obd->obd_uuid,
6302                                      &exp->exp_client_uuid)))
6303                 RETURN(0);
6304
6305         rc = tgt_client_alloc(exp);
6306         if (rc)
6307                 GOTO(err, rc);
6308
6309         rc = ldlm_init_export(exp);
6310         if (rc)
6311                 GOTO(err_free, rc);
6312
6313         RETURN(rc);
6314
6315 err_free:
6316         tgt_client_free(exp);
6317 err:
6318         CERROR("%s: Failed to initialize export: rc = %d\n",
6319                exp->exp_obd->obd_name, rc);
6320         return rc;
6321 }
6322
6323 static int mdt_destroy_export(struct obd_export *exp)
6324 {
6325         ENTRY;
6326
6327         target_destroy_export(exp);
6328         /* destroy can be called from failed obd_setup, so
6329          * checking uuid is safer than obd_self_export */
6330         if (unlikely(obd_uuid_equals(&exp->exp_obd->obd_uuid,
6331                                      &exp->exp_client_uuid)))
6332                 RETURN(0);
6333
6334         ldlm_destroy_export(exp);
6335         tgt_client_free(exp);
6336
6337         LASSERT(list_empty(&exp->exp_outstanding_replies));
6338         LASSERT(list_empty(&exp->exp_mdt_data.med_open_head));
6339
6340         /*
6341          * discard grants once we're sure no more
6342          * interaction with the client is possible
6343          */
6344         tgt_grant_discard(exp);
6345         if (exp_connect_flags(exp) & OBD_CONNECT_GRANT)
6346                 exp->exp_obd->u.obt.obt_lut->lut_tgd.tgd_tot_granted_clients--;
6347
6348         if (!(exp->exp_flags & OBD_OPT_FORCE))
6349                 tgt_grant_sanity_check(exp->exp_obd, __func__);
6350
6351         RETURN(0);
6352 }
6353
6354 int mdt_links_read(struct mdt_thread_info *info, struct mdt_object *mdt_obj,
6355                    struct linkea_data *ldata)
6356 {
6357         int rc;
6358
6359         LASSERT(ldata->ld_buf->lb_buf != NULL);
6360
6361         if (!mdt_object_exists(mdt_obj))
6362                 return -ENODATA;
6363
6364         rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
6365                           ldata->ld_buf, XATTR_NAME_LINK);
6366         if (rc == -ERANGE) {
6367                 /* Buf was too small, figure out what we need. */
6368                 lu_buf_free(ldata->ld_buf);
6369                 rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
6370                                   ldata->ld_buf, XATTR_NAME_LINK);
6371                 if (rc < 0)
6372                         return rc;
6373                 ldata->ld_buf = lu_buf_check_and_alloc(ldata->ld_buf, rc);
6374                 if (ldata->ld_buf->lb_buf == NULL)
6375                         return -ENOMEM;
6376                 rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
6377                                   ldata->ld_buf, XATTR_NAME_LINK);
6378         }
6379         if (rc < 0)
6380                 return rc;
6381
6382         return linkea_init_with_rec(ldata);
6383 }
6384
6385 /**
6386  * Given an MDT object, try to look up the full path to the object.
6387  * Part of the MDT layer implementation of lfs fid2path.
6388  *
6389  * \param[in]     info  Per-thread common data shared by MDT level handlers.
6390  * \param[in]     obj   Object to do path lookup of
6391  * \param[in,out] fp    User-provided struct to store path information
6392  * \param[in]     root_fid Root FID of current path should reach
6393  *
6394  * \retval 0 Lookup successful, path information stored in fp
6395  * \retval -EAGAIN Lookup failed, usually because object is being moved
6396  * \retval negative errno if there was a problem
6397  */
6398 static int mdt_path_current(struct mdt_thread_info *info,
6399                             struct mdt_object *obj,
6400                             struct getinfo_fid2path *fp,
6401                             struct lu_fid *root_fid)
6402 {
6403         struct mdt_device       *mdt = info->mti_mdt;
6404         struct mdt_object       *mdt_obj;
6405         struct link_ea_header   *leh;
6406         struct link_ea_entry    *lee;
6407         struct lu_name          *tmpname = &info->mti_name;
6408         struct lu_fid           *tmpfid = &info->mti_tmp_fid1;
6409         struct lu_buf           *buf = &info->mti_big_buf;
6410         char                    *ptr;
6411         int                     reclen;
6412         struct linkea_data      ldata = { NULL };
6413         int                     rc = 0;
6414         bool                    first = true;
6415         ENTRY;
6416
6417         /* temp buffer for path element, the buffer will be finally freed
6418          * in mdt_thread_info_fini */
6419         buf = lu_buf_check_and_alloc(buf, PATH_MAX);
6420         if (buf->lb_buf == NULL)
6421                 RETURN(-ENOMEM);
6422
6423         ldata.ld_buf = buf;
6424         ptr = fp->gf_u.gf_path + fp->gf_pathlen - 1;
6425         *ptr = 0;
6426         --ptr;
6427         *tmpfid = fp->gf_fid = *mdt_object_fid(obj);
6428
6429         while (!lu_fid_eq(root_fid, &fp->gf_fid)) {
6430                 struct lu_buf           lmv_buf;
6431
6432                 if (!lu_fid_eq(root_fid, &mdt->mdt_md_root_fid) &&
6433                     lu_fid_eq(&mdt->mdt_md_root_fid, &fp->gf_fid))
6434                         GOTO(out, rc = -ENOENT);
6435
6436                 if (lu_fid_eq(mdt_object_fid(obj), tmpfid)) {
6437                         mdt_obj = obj;
6438                         mdt_object_get(info->mti_env, mdt_obj);
6439                 } else {
6440                         mdt_obj = mdt_object_find(info->mti_env, mdt, tmpfid);
6441                         if (IS_ERR(mdt_obj))
6442                                 GOTO(out, rc = PTR_ERR(mdt_obj));
6443                 }
6444
6445                 if (!mdt_object_exists(mdt_obj)) {
6446                         mdt_object_put(info->mti_env, mdt_obj);
6447                         GOTO(out, rc = -ENOENT);
6448                 }
6449
6450                 if (mdt_object_remote(mdt_obj)) {
6451                         mdt_object_put(info->mti_env, mdt_obj);
6452                         GOTO(remote_out, rc = -EREMOTE);
6453                 }
6454
6455                 rc = mdt_links_read(info, mdt_obj, &ldata);
6456                 if (rc != 0) {
6457                         mdt_object_put(info->mti_env, mdt_obj);
6458                         GOTO(out, rc);
6459                 }
6460
6461                 leh = buf->lb_buf;
6462                 lee = (struct link_ea_entry *)(leh + 1); /* link #0 */
6463                 linkea_entry_unpack(lee, &reclen, tmpname, tmpfid);
6464                 /* If set, use link #linkno for path lookup, otherwise use
6465                    link #0.  Only do this for the final path element. */
6466                 if (first && fp->gf_linkno < leh->leh_reccount) {
6467                         int count;
6468                         for (count = 0; count < fp->gf_linkno; count++) {
6469                                 lee = (struct link_ea_entry *)
6470                                      ((char *)lee + reclen);
6471                                 linkea_entry_unpack(lee, &reclen, tmpname,
6472                                                     tmpfid);
6473                         }
6474                         if (fp->gf_linkno < leh->leh_reccount - 1)
6475                                 /* indicate to user there are more links */
6476                                 fp->gf_linkno++;
6477                 }
6478
6479                 lmv_buf.lb_buf = info->mti_xattr_buf;
6480                 lmv_buf.lb_len = sizeof(info->mti_xattr_buf);
6481                 /* Check if it is slave stripes */
6482                 rc = mo_xattr_get(info->mti_env, mdt_object_child(mdt_obj),
6483                                   &lmv_buf, XATTR_NAME_LMV);
6484                 mdt_object_put(info->mti_env, mdt_obj);
6485                 if (rc > 0) {
6486                         union lmv_mds_md *lmm = lmv_buf.lb_buf;
6487
6488                         /* For slave stripes, get its master */
6489                         if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) {
6490                                 fp->gf_fid = *tmpfid;
6491                                 continue;
6492                         }
6493                 } else if (rc < 0 && rc != -ENODATA) {
6494                         GOTO(out, rc);
6495                 }
6496
6497                 rc = 0;
6498
6499                 /* Pack the name in the end of the buffer */
6500                 ptr -= tmpname->ln_namelen;
6501                 if (ptr - 1 <= fp->gf_u.gf_path)
6502                         GOTO(out, rc = -EOVERFLOW);
6503                 strncpy(ptr, tmpname->ln_name, tmpname->ln_namelen);
6504                 *(--ptr) = '/';
6505
6506                 /* keep the last resolved fid to the client, so the
6507                  * client will build the left path on another MDT for
6508                  * remote object */
6509                 fp->gf_fid = *tmpfid;
6510
6511                 first = false;
6512         }
6513
6514 remote_out:
6515         ptr++; /* skip leading / */
6516         memmove(fp->gf_u.gf_path, ptr,
6517                 fp->gf_u.gf_path + fp->gf_pathlen - ptr);
6518
6519 out:
6520         RETURN(rc);
6521 }
6522
6523 /**
6524  * Given an MDT object, use mdt_path_current to get the path.
6525  * Essentially a wrapper to retry mdt_path_current a set number of times
6526  * if -EAGAIN is returned (usually because an object is being moved).
6527  *
6528  * Part of the MDT layer implementation of lfs fid2path.
6529  *
6530  * \param[in]     info  Per-thread common data shared by mdt level handlers.
6531  * \param[in]     obj   Object to do path lookup of
6532  * \param[in,out] fp    User-provided struct for arguments and to store path
6533  *                      information
6534  *
6535  * \retval 0 Lookup successful, path information stored in fp
6536  * \retval negative errno if there was a problem
6537  */
6538 static int mdt_path(struct mdt_thread_info *info, struct mdt_object *obj,
6539                     struct getinfo_fid2path *fp, struct lu_fid *root_fid)
6540 {
6541         struct mdt_device       *mdt = info->mti_mdt;
6542         int                     tries = 3;
6543         int                     rc = -EAGAIN;
6544         ENTRY;
6545
6546         if (fp->gf_pathlen < 3)
6547                 RETURN(-EOVERFLOW);
6548
6549         if (root_fid == NULL)
6550                 root_fid = &mdt->mdt_md_root_fid;
6551
6552         if (lu_fid_eq(root_fid, mdt_object_fid(obj))) {
6553                 fp->gf_u.gf_path[0] = '\0';
6554                 RETURN(0);
6555         }
6556
6557         /* Retry multiple times in case file is being moved */
6558         while (tries-- && rc == -EAGAIN)
6559                 rc = mdt_path_current(info, obj, fp, root_fid);
6560
6561         RETURN(rc);
6562 }
6563
6564 /**
6565  * Get the full path of the provided FID, as of changelog record recno.
6566  *
6567  * This checks sanity and looks up object for user provided FID
6568  * before calling the actual path lookup code.
6569  *
6570  * Part of the MDT layer implementation of lfs fid2path.
6571  *
6572  * \param[in]     info  Per-thread common data shared by mdt level handlers.
6573  * \param[in,out] fp    User-provided struct for arguments and to store path
6574  *                      information
6575  *
6576  * \retval 0 Lookup successful, path information and recno stored in fp
6577  * \retval -ENOENT, object does not exist
6578  * \retval negative errno if there was a problem
6579  */
6580 static int mdt_fid2path(struct mdt_thread_info *info,
6581                         struct lu_fid *root_fid,
6582                         struct getinfo_fid2path *fp)
6583 {
6584         struct mdt_device *mdt = info->mti_mdt;
6585         struct mdt_object *obj;
6586         int    rc;
6587         ENTRY;
6588
6589         CDEBUG(D_IOCTL, "path get "DFID" from %llu #%d\n",
6590                 PFID(&fp->gf_fid), fp->gf_recno, fp->gf_linkno);
6591
6592         if (!fid_is_sane(&fp->gf_fid))
6593                 RETURN(-EINVAL);
6594
6595         if (!fid_is_namespace_visible(&fp->gf_fid)) {
6596                 CDEBUG(D_INFO, "%s: "DFID" is invalid, f_seq should be >= %#llx"
6597                        ", or f_oid != 0, or f_ver == 0\n", mdt_obd_name(mdt),
6598                        PFID(&fp->gf_fid), (__u64)FID_SEQ_NORMAL);
6599                 RETURN(-EINVAL);
6600         }
6601
6602         obj = mdt_object_find(info->mti_env, mdt, &fp->gf_fid);
6603         if (IS_ERR(obj)) {
6604                 rc = PTR_ERR(obj);
6605                 CDEBUG(D_IOCTL, "cannot find "DFID": rc = %d\n",
6606                        PFID(&fp->gf_fid), rc);
6607                 RETURN(rc);
6608         }
6609
6610         if (mdt_object_remote(obj))
6611                 rc = -EREMOTE;
6612         else if (!mdt_object_exists(obj))
6613                 rc = -ENOENT;
6614         else
6615                 rc = 0;
6616
6617         if (rc < 0) {
6618                 mdt_object_put(info->mti_env, obj);
6619                 CDEBUG(D_IOCTL, "nonlocal object "DFID": rc = %d\n",
6620                        PFID(&fp->gf_fid), rc);
6621                 RETURN(rc);
6622         }
6623
6624         rc = mdt_path(info, obj, fp, root_fid);
6625
6626         CDEBUG(D_INFO, "fid "DFID", path %s recno %#llx linkno %u\n",
6627                PFID(&fp->gf_fid), fp->gf_u.gf_path,
6628                fp->gf_recno, fp->gf_linkno);
6629
6630         mdt_object_put(info->mti_env, obj);
6631
6632         RETURN(rc);
6633 }
6634
6635 static int mdt_rpc_fid2path(struct mdt_thread_info *info, void *key, int keylen,
6636                             void *val, int vallen)
6637 {
6638         struct getinfo_fid2path *fpout, *fpin;
6639         struct lu_fid *root_fid = NULL;
6640         int rc = 0;
6641
6642         fpin = key + cfs_size_round(sizeof(KEY_FID2PATH));
6643         fpout = val;
6644
6645         if (ptlrpc_req_need_swab(info->mti_pill->rc_req))
6646                 lustre_swab_fid2path(fpin);
6647
6648         memcpy(fpout, fpin, sizeof(*fpin));
6649         if (fpout->gf_pathlen != vallen - sizeof(*fpin))
6650                 RETURN(-EINVAL);
6651
6652         if (keylen >= cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*fpin) +
6653                       sizeof(struct lu_fid)) {
6654                 /* client sent its root FID, which is normally fileset FID */
6655                 root_fid = fpin->gf_u.gf_root_fid;
6656                 if (ptlrpc_req_need_swab(info->mti_pill->rc_req))
6657                         lustre_swab_lu_fid(root_fid);
6658
6659                 if (root_fid != NULL && !fid_is_sane(root_fid))
6660                         RETURN(-EINVAL);
6661         }
6662
6663         rc = mdt_fid2path(info, root_fid, fpout);
6664         RETURN(rc);
6665 }
6666
6667 int mdt_get_info(struct tgt_session_info *tsi)
6668 {
6669         char    *key;
6670         int      keylen;
6671         __u32   *vallen;
6672         void    *valout;
6673         int      rc;
6674
6675         ENTRY;
6676
6677         key = req_capsule_client_get(tsi->tsi_pill, &RMF_GETINFO_KEY);
6678         if (key == NULL) {
6679                 CDEBUG(D_IOCTL, "No GETINFO key\n");
6680                 RETURN(err_serious(-EFAULT));
6681         }
6682         keylen = req_capsule_get_size(tsi->tsi_pill, &RMF_GETINFO_KEY,
6683                                       RCL_CLIENT);
6684
6685         vallen = req_capsule_client_get(tsi->tsi_pill, &RMF_GETINFO_VALLEN);
6686         if (vallen == NULL) {
6687                 CDEBUG(D_IOCTL, "%s: cannot get RMF_GETINFO_VALLEN buffer\n",
6688                                 tgt_name(tsi->tsi_tgt));
6689                 RETURN(err_serious(-EFAULT));
6690         }
6691
6692         req_capsule_set_size(tsi->tsi_pill, &RMF_GETINFO_VAL, RCL_SERVER,
6693                              *vallen);
6694         rc = req_capsule_server_pack(tsi->tsi_pill);
6695         if (rc)
6696                 RETURN(err_serious(rc));
6697
6698         valout = req_capsule_server_get(tsi->tsi_pill, &RMF_GETINFO_VAL);
6699         if (valout == NULL) {
6700                 CDEBUG(D_IOCTL, "%s: cannot get get-info RPC out buffer\n",
6701                                 tgt_name(tsi->tsi_tgt));
6702                 RETURN(err_serious(-EFAULT));
6703         }
6704
6705         if (KEY_IS(KEY_FID2PATH)) {
6706                 struct mdt_thread_info  *info = tsi2mdt_info(tsi);
6707
6708                 rc = mdt_rpc_fid2path(info, key, keylen, valout, *vallen);
6709                 mdt_thread_info_fini(info);
6710         } else {
6711                 rc = -EINVAL;
6712         }
6713         RETURN(rc);
6714 }
6715
6716 static int mdt_ioc_version_get(struct mdt_thread_info *mti, void *karg)
6717 {
6718         struct obd_ioctl_data *data = karg;
6719         struct lu_fid *fid;
6720         __u64 version;
6721         struct mdt_object *obj;
6722         struct mdt_lock_handle  *lh;
6723         int rc;
6724         ENTRY;
6725
6726         if (data->ioc_inlbuf1 == NULL || data->ioc_inllen1 != sizeof(*fid) ||
6727             data->ioc_inlbuf2 == NULL || data->ioc_inllen2 != sizeof(version))
6728                 RETURN(-EINVAL);
6729
6730         fid = (struct lu_fid *)data->ioc_inlbuf1;
6731
6732         if (!fid_is_sane(fid))
6733                 RETURN(-EINVAL);
6734
6735         CDEBUG(D_IOCTL, "getting version for "DFID"\n", PFID(fid));
6736
6737         lh = &mti->mti_lh[MDT_LH_PARENT];
6738         mdt_lock_reg_init(lh, LCK_CR);
6739
6740         obj = mdt_object_find_lock(mti, fid, lh, MDS_INODELOCK_UPDATE);
6741         if (IS_ERR(obj))
6742                 RETURN(PTR_ERR(obj));
6743
6744         if (mdt_object_remote(obj)) {
6745                 rc = -EREMOTE;
6746                 /**
6747                  * before calling version get the correct MDS should be
6748                  * fid, this is error to find remote object here
6749                  */
6750                 CERROR("nonlocal object "DFID"\n", PFID(fid));
6751         } else if (!mdt_object_exists(obj)) {
6752                 *(__u64 *)data->ioc_inlbuf2 = ENOENT_VERSION;
6753                 rc = -ENOENT;
6754         } else {
6755                 version = dt_version_get(mti->mti_env, mdt_obj2dt(obj));
6756                *(__u64 *)data->ioc_inlbuf2 = version;
6757                 rc = 0;
6758         }
6759         mdt_object_unlock_put(mti, obj, lh, 1);
6760         RETURN(rc);
6761 }
6762
6763 /* ioctls on obd dev */
6764 static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
6765                          void *karg, void __user *uarg)
6766 {
6767         struct lu_env      env;
6768         struct obd_device *obd = exp->exp_obd;
6769         struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev);
6770         struct dt_device  *dt = mdt->mdt_bottom;
6771         int rc;
6772
6773         ENTRY;
6774         CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
6775         rc = lu_env_init(&env, LCT_MD_THREAD);
6776         if (rc)
6777                 RETURN(rc);
6778
6779         switch (cmd) {
6780         case OBD_IOC_SYNC:
6781                 rc = mdt_device_sync(&env, mdt);
6782                 break;
6783         case OBD_IOC_SET_READONLY:
6784                 rc = dt_sync(&env, dt);
6785                 if (rc == 0)
6786                         rc = dt_ro(&env, dt);
6787                 break;
6788         case OBD_IOC_ABORT_RECOVERY:
6789                 CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
6790                 obd->obd_abort_recovery = 1;
6791                 target_stop_recovery_thread(obd);
6792                 rc = 0;
6793                 break;
6794         case OBD_IOC_CHANGELOG_REG:
6795         case OBD_IOC_CHANGELOG_DEREG:
6796         case OBD_IOC_CHANGELOG_CLEAR:
6797                 rc = mdt->mdt_child->md_ops->mdo_iocontrol(&env,
6798                                                            mdt->mdt_child,
6799                                                            cmd, len, karg);
6800                 break;
6801         case OBD_IOC_START_LFSCK: {
6802                 struct md_device *next = mdt->mdt_child;
6803                 struct obd_ioctl_data *data = karg;
6804                 struct lfsck_start_param lsp;
6805
6806                 if (unlikely(data == NULL)) {
6807                         rc = -EINVAL;
6808                         break;
6809                 }
6810
6811                 lsp.lsp_start = (struct lfsck_start *)(data->ioc_inlbuf1);
6812                 lsp.lsp_index_valid = 0;
6813                 rc = next->md_ops->mdo_iocontrol(&env, next, cmd, 0, &lsp);
6814                 break;
6815         }
6816         case OBD_IOC_STOP_LFSCK: {
6817                 struct md_device        *next = mdt->mdt_child;
6818                 struct obd_ioctl_data   *data = karg;
6819                 struct lfsck_stop        stop;
6820
6821                 stop.ls_status = LS_STOPPED;
6822                 /* Old lfsck utils may pass NULL @stop. */
6823                 if (data->ioc_inlbuf1 == NULL)
6824                         stop.ls_flags = 0;
6825                 else
6826                         stop.ls_flags =
6827                         ((struct lfsck_stop *)(data->ioc_inlbuf1))->ls_flags;
6828
6829                 rc = next->md_ops->mdo_iocontrol(&env, next, cmd, 0, &stop);
6830                 break;
6831         }
6832         case OBD_IOC_QUERY_LFSCK: {
6833                 struct md_device        *next = mdt->mdt_child;
6834                 struct obd_ioctl_data   *data = karg;
6835
6836                 rc = next->md_ops->mdo_iocontrol(&env, next, cmd, 0,
6837                                                  data->ioc_inlbuf1);
6838                 break;
6839         }
6840         case OBD_IOC_GET_OBJ_VERSION: {
6841                 struct mdt_thread_info *mti;
6842                 mti = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
6843                 memset(mti, 0, sizeof *mti);
6844                 mti->mti_env = &env;
6845                 mti->mti_mdt = mdt;
6846                 mti->mti_exp = exp;
6847
6848                 rc = mdt_ioc_version_get(mti, karg);
6849                 break;
6850         }
6851         case OBD_IOC_CATLOGLIST: {
6852                 struct mdt_thread_info *mti;
6853
6854                 mti = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
6855                 lu_local_obj_fid(&mti->mti_tmp_fid1, LLOG_CATALOGS_OID);
6856                 rc = llog_catalog_list(&env, mdt->mdt_bottom, 0, karg,
6857                                        &mti->mti_tmp_fid1);
6858                 break;
6859          }
6860         default:
6861                 rc = -EOPNOTSUPP;
6862                 CERROR("%s: Not supported cmd = %d, rc = %d\n",
6863                         mdt_obd_name(mdt), cmd, rc);
6864         }
6865
6866         lu_env_fini(&env);
6867         RETURN(rc);
6868 }
6869
6870 static int mdt_postrecov(const struct lu_env *env, struct mdt_device *mdt)
6871 {
6872         struct lu_device *ld = md2lu_dev(mdt->mdt_child);
6873         int rc;
6874         ENTRY;
6875
6876         if (!mdt->mdt_skip_lfsck && !mdt->mdt_bottom->dd_rdonly) {
6877                 struct lfsck_start_param lsp;
6878
6879                 lsp.lsp_start = NULL;
6880                 lsp.lsp_index_valid = 0;
6881                 rc = mdt->mdt_child->md_ops->mdo_iocontrol(env, mdt->mdt_child,
6882                                                            OBD_IOC_START_LFSCK,
6883                                                            0, &lsp);
6884                 if (rc != 0 && rc != -EALREADY)
6885                         CWARN("%s: auto trigger paused LFSCK failed: rc = %d\n",
6886                               mdt_obd_name(mdt), rc);
6887         }
6888
6889         rc = ld->ld_ops->ldo_recovery_complete(env, ld);
6890         RETURN(rc);
6891 }
6892
6893 static int mdt_obd_postrecov(struct obd_device *obd)
6894 {
6895         struct lu_env env;
6896         int rc;
6897
6898         rc = lu_env_init(&env, LCT_MD_THREAD);
6899         if (rc)
6900                 RETURN(rc);
6901         rc = mdt_postrecov(&env, mdt_dev(obd->obd_lu_dev));
6902         lu_env_fini(&env);
6903         return rc;
6904 }
6905
6906 static struct obd_ops mdt_obd_device_ops = {
6907         .o_owner          = THIS_MODULE,
6908         .o_set_info_async = mdt_obd_set_info_async,
6909         .o_connect        = mdt_obd_connect,
6910         .o_reconnect      = mdt_obd_reconnect,
6911         .o_disconnect     = mdt_obd_disconnect,
6912         .o_init_export    = mdt_init_export,
6913         .o_destroy_export = mdt_destroy_export,
6914         .o_iocontrol      = mdt_iocontrol,
6915         .o_postrecov      = mdt_obd_postrecov,
6916         /* Data-on-MDT IO methods */
6917         .o_preprw         = mdt_obd_preprw,
6918         .o_commitrw       = mdt_obd_commitrw,
6919 };
6920
6921 static struct lu_device* mdt_device_fini(const struct lu_env *env,
6922                                          struct lu_device *d)
6923 {
6924         struct mdt_device *m = mdt_dev(d);
6925         ENTRY;
6926
6927         mdt_fini(env, m);
6928         RETURN(NULL);
6929 }
6930
6931 static struct lu_device *mdt_device_free(const struct lu_env *env,
6932                                          struct lu_device *d)
6933 {
6934         struct mdt_device *m = mdt_dev(d);
6935         ENTRY;
6936
6937         lu_device_fini(&m->mdt_lu_dev);
6938         OBD_FREE_PTR(m);
6939
6940         RETURN(NULL);
6941 }
6942
6943 static struct lu_device *mdt_device_alloc(const struct lu_env *env,
6944                                           struct lu_device_type *t,
6945                                           struct lustre_cfg *cfg)
6946 {
6947         struct lu_device  *l;
6948         struct mdt_device *m;
6949
6950         OBD_ALLOC_PTR(m);
6951         if (m != NULL) {
6952                 int rc;
6953
6954                 l = &m->mdt_lu_dev;
6955                 rc = mdt_init0(env, m, t, cfg);
6956                 if (rc != 0) {
6957                         mdt_device_free(env, l);
6958                         l = ERR_PTR(rc);
6959                         return l;
6960                 }
6961         } else
6962                 l = ERR_PTR(-ENOMEM);
6963         return l;
6964 }
6965
6966 /* context key constructor/destructor: mdt_key_init, mdt_key_fini */
6967 LU_KEY_INIT(mdt, struct mdt_thread_info);
6968
6969 static void mdt_key_fini(const struct lu_context *ctx,
6970                          struct lu_context_key *key, void* data)
6971 {
6972         struct mdt_thread_info *info = data;
6973
6974         if (info->mti_big_lmm) {
6975                 OBD_FREE_LARGE(info->mti_big_lmm, info->mti_big_lmmsize);
6976                 info->mti_big_lmm = NULL;
6977                 info->mti_big_lmmsize = 0;
6978         }
6979
6980         if (info->mti_big_acl) {
6981                 OBD_FREE_LARGE(info->mti_big_acl, info->mti_big_aclsize);
6982                 info->mti_big_acl = NULL;
6983                 info->mti_big_aclsize = 0;
6984         }
6985
6986         OBD_FREE_PTR(info);
6987 }
6988
6989 /* context key: mdt_thread_key */
6990 LU_CONTEXT_KEY_DEFINE(mdt, LCT_MD_THREAD);
6991
6992 struct lu_ucred *mdt_ucred(const struct mdt_thread_info *info)
6993 {
6994         return lu_ucred(info->mti_env);
6995 }
6996
6997 struct lu_ucred *mdt_ucred_check(const struct mdt_thread_info *info)
6998 {
6999         return lu_ucred_check(info->mti_env);
7000 }
7001
7002 /**
7003  * Enable/disable COS (Commit On Sharing).
7004  *
7005  * Set/Clear the COS flag in mdt options.
7006  *
7007  * \param mdt mdt device
7008  * \param val 0 disables COS, other values enable COS
7009  */
7010 void mdt_enable_cos(struct mdt_device *mdt, bool val)
7011 {
7012         struct lu_env env;
7013         int rc;
7014
7015         mdt->mdt_opts.mo_cos = val;
7016         rc = lu_env_init(&env, LCT_LOCAL);
7017         if (unlikely(rc != 0)) {
7018                 CWARN("%s: lu_env initialization failed, cannot "
7019                       "sync: rc = %d\n", mdt_obd_name(mdt), rc);
7020                 return;
7021         }
7022         mdt_device_sync(&env, mdt);
7023         lu_env_fini(&env);
7024 }
7025
7026 /**
7027  * Check COS (Commit On Sharing) status.
7028  *
7029  * Return COS flag status.
7030  *
7031  * \param mdt mdt device
7032  */
7033 int mdt_cos_is_enabled(struct mdt_device *mdt)
7034 {
7035         return mdt->mdt_opts.mo_cos != 0;
7036 }
7037
7038 static struct lu_device_type_operations mdt_device_type_ops = {
7039         .ldto_device_alloc = mdt_device_alloc,
7040         .ldto_device_free  = mdt_device_free,
7041         .ldto_device_fini  = mdt_device_fini
7042 };
7043
7044 static struct lu_device_type mdt_device_type = {
7045         .ldt_tags     = LU_DEVICE_MD,
7046         .ldt_name     = LUSTRE_MDT_NAME,
7047         .ldt_ops      = &mdt_device_type_ops,
7048         .ldt_ctx_tags = LCT_MD_THREAD
7049 };
7050
7051 static int __init mdt_init(void)
7052 {
7053         int rc;
7054
7055         CLASSERT(sizeof("0x0123456789ABCDEF:0x01234567:0x01234567") ==
7056                  FID_NOBRACE_LEN + 1);
7057         CLASSERT(sizeof("[0x0123456789ABCDEF:0x01234567:0x01234567]") ==
7058                  FID_LEN + 1);
7059         rc = lu_kmem_init(mdt_caches);
7060         if (rc)
7061                 return rc;
7062
7063         rc = mds_mod_init();
7064         if (rc)
7065                 GOTO(lu_fini, rc);
7066
7067         rc = class_register_type(&mdt_obd_device_ops, NULL, true, NULL,
7068                                  LUSTRE_MDT_NAME, &mdt_device_type);
7069         if (rc)
7070                 GOTO(mds_fini, rc);
7071 lu_fini:
7072         if (rc)
7073                 lu_kmem_fini(mdt_caches);
7074 mds_fini:
7075         if (rc)
7076                 mds_mod_exit();
7077         return rc;
7078 }
7079
7080 static void __exit mdt_exit(void)
7081 {
7082         class_unregister_type(LUSTRE_MDT_NAME);
7083         mds_mod_exit();
7084         lu_kmem_fini(mdt_caches);
7085 }
7086
7087 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
7088 MODULE_DESCRIPTION("Lustre Metadata Target ("LUSTRE_MDT_NAME")");
7089 MODULE_VERSION(LUSTRE_VERSION_STRING);
7090 MODULE_LICENSE("GPL");
7091
7092 module_init(mdt_init);
7093 module_exit(mdt_exit);