Whamcloud - gitweb
c8cec71f588e3cc690aaed8bfd38f03f17b6e9d3
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_handler.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * lustre/osd/osd_handler.c
33  *
34  * Top-level entry points into osd module
35  *
36  * Author: Nikita Danilov <nikita@clusterfs.com>
37  *         Pravin Shelar <pravin.shelar@sun.com> : Added fid in dirent
38  */
39
40 #define DEBUG_SUBSYSTEM S_OSD
41
42 #include <linux/fs_struct.h>
43 #include <linux/kallsyms.h>
44 #include <linux/module.h>
45 #include <linux/user_namespace.h>
46 #include <linux/uidgid.h>
47
48 /* prerequisite for linux/xattr.h */
49 #include <linux/types.h>
50 /* prerequisite for linux/xattr.h */
51 #include <linux/fs.h>
52 /* XATTR_{REPLACE,CREATE} */
53 #include <linux/xattr.h>
54
55 #include <ldiskfs/ldiskfs.h>
56 #include <ldiskfs/xattr.h>
57 #include <ldiskfs/ldiskfs_extents.h>
58 #undef ENTRY
59 /*
60  * struct OBD_{ALLOC,FREE}*()
61  * OBD_FAIL_CHECK
62  */
63 #include <obd_support.h>
64 /* struct ptlrpc_thread */
65 #include <lustre_net.h>
66 #include <lustre_fid.h>
67 /* process_config */
68 #include <uapi/linux/lustre/lustre_param.h>
69
70 #include "osd_internal.h"
71 #include "osd_dynlocks.h"
72
73 /* llo_* api support */
74 #include <md_object.h>
75 #include <lustre_quota.h>
76
77 #include <lustre_linkea.h>
78
79 /* Maximum EA size is limited by LNET_MTU for remote objects */
80 #define OSD_MAX_EA_SIZE 1048364
81
/*
 * Module parameter described as "ldiskfs with parallel directory
 * operations". Defaults to 1 and is exposed with mode 0644.
 */
int ldiskfs_pdo = 1;
module_param(ldiskfs_pdo, int, 0644);
MODULE_PARM_DESC(ldiskfs_pdo, "ldiskfs with parallel directory operations");

/*
 * Module parameter described as "LBUG during tracking of declares".
 * It has no explicit initializer, so it starts out as 0.
 * NOTE(review): presumably a non-zero value turns declare-tracking
 * mismatches into an LBUG -- confirm against the code that reads it.
 */
int ldiskfs_track_declares_assert;
module_param(ldiskfs_track_declares_assert, int, 0644);
MODULE_PARM_DESC(ldiskfs_track_declares_assert, "LBUG during tracking of declares");
89
/* Slab to allocate dynlocks */
struct kmem_cache *dynlock_cachep;

/* Slab to allocate osd_it_ea */
struct kmem_cache *osd_itea_cachep;

/*
 * Descriptors for the two slab caches above: each entry names the cache,
 * gives its object size and points at the variable holding the cache.
 * The table is terminated by an entry whose ckd_cache is NULL.
 * NOTE(review): presumably consumed by the generic lu_kmem setup/teardown
 * helpers at module init/exit -- confirm at the call site.
 */
static struct lu_kmem_descr ldiskfs_caches[] = {
        {
                .ckd_cache = &dynlock_cachep,
                .ckd_name  = "dynlock_cache",
                .ckd_size  = sizeof(struct dynlock_handle)
        },
        {
                .ckd_cache = &osd_itea_cachep,
                .ckd_name  = "osd_itea_cache",
                .ckd_size  = sizeof(struct osd_it_ea)
        },
        {
                /* terminator */
                .ckd_cache = NULL
        }
};
111
/* Names of the "." and ".." directory entries. */
static const char dot[] = ".";
static const char dotdot[] = "..";

/*
 * Forward declarations of the operation tables referenced by the code
 * below (e.g. osd_object_alloc() installs osd_obj_ops and osd_lu_obj_ops).
 * NOTE(review): the definitions are presumably further down in this file.
 */
static const struct lu_object_operations      osd_lu_obj_ops;
static const struct dt_object_operations      osd_obj_ops;
static const struct dt_object_operations      osd_obj_otable_it_ops;
static const struct dt_index_operations       osd_index_iam_ops;
static const struct dt_index_operations       osd_index_ea_ops;

/* Forward declarations of helpers used before their definition. */
static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
                          const struct lu_fid *fid);
static int osd_process_scheduled_agent_removals(const struct lu_env *env,
                                                struct osd_device *osd);
125
126 int osd_trans_declare_op2rb[] = {
127         [OSD_OT_ATTR_SET]       = OSD_OT_ATTR_SET,
128         [OSD_OT_PUNCH]          = OSD_OT_MAX,
129         [OSD_OT_XATTR_SET]      = OSD_OT_XATTR_SET,
130         [OSD_OT_CREATE]         = OSD_OT_DESTROY,
131         [OSD_OT_DESTROY]        = OSD_OT_CREATE,
132         [OSD_OT_REF_ADD]        = OSD_OT_REF_DEL,
133         [OSD_OT_REF_DEL]        = OSD_OT_REF_ADD,
134         [OSD_OT_WRITE]          = OSD_OT_WRITE,
135         [OSD_OT_INSERT]         = OSD_OT_DELETE,
136         [OSD_OT_DELETE]         = OSD_OT_INSERT,
137         [OSD_OT_QUOTA]          = OSD_OT_MAX,
138 };
139
140 static int osd_has_index(const struct osd_object *obj)
141 {
142         return obj->oo_dt.do_index_ops != NULL;
143 }
144
/*
 * Evaluate osd_invariant() on the osd_object that embeds the lu_object @l
 * and return its result.
 */
static int osd_object_invariant(const struct lu_object *l)
{
        return osd_invariant(osd_obj(l));
}
149
150 /*
151  * Concurrency: doesn't matter
152  */
153 static int osd_is_write_locked(const struct lu_env *env, struct osd_object *o)
154 {
155         struct osd_thread_info *oti = osd_oti_get(env);
156
157         return oti->oti_w_locks > 0 && o->oo_owner == env;
158 }
159
/*
 * Fill @f with the local-object FID built from OSD_FS_ROOT_OID.
 * @env and @dev are not used. Always returns 0.
 *
 * Concurrency: doesn't access mutable data
 */
static int osd_root_get(const struct lu_env *env,
                        struct dt_device *dev, struct lu_fid *f)
{
        lu_local_obj_fid(f, OSD_FS_ROOT_OID);
        return 0;
}
169
170 /*
171  * the following set of functions are used to maintain per-thread
172  * cache of FID->ino mapping. this mechanism is needed to resolve
173  * FID to inode at dt_insert() which in turn stores ino in the
174  * directory entries to keep ldiskfs compatible with ext[34].
175  * due to locking-originated restrictions we can't lookup ino
176  * using LU cache (deadlock is possible). lookup using OI is quite
177  * expensive. so instead we maintain this cache and methods like
178  * dt_create() fill it. so in the majority of cases dt_insert() is
179  * able to find needed mapping in lockless manner.
180  */
181 static struct osd_idmap_cache *
182 osd_idc_find(const struct lu_env *env, struct osd_device *osd,
183              const struct lu_fid *fid)
184 {
185         struct osd_thread_info *oti = osd_oti_get(env);
186         struct osd_idmap_cache *idc = oti->oti_ins_cache;
187         int i;
188
189         for (i = 0; i < oti->oti_ins_cache_used; i++) {
190                 if (!lu_fid_eq(&idc[i].oic_fid, fid))
191                         continue;
192                 if (idc[i].oic_dev != osd)
193                         continue;
194
195                 return idc + i;
196         }
197
198         return NULL;
199 }
200
201 static struct osd_idmap_cache *
202 osd_idc_add(const struct lu_env *env, struct osd_device *osd,
203             const struct lu_fid *fid)
204 {
205         struct osd_thread_info *oti   = osd_oti_get(env);
206         struct osd_idmap_cache *idc;
207         int i;
208
209         if (unlikely(oti->oti_ins_cache_used >= oti->oti_ins_cache_size)) {
210                 i = oti->oti_ins_cache_size * 2;
211                 if (i == 0)
212                         i = OSD_INS_CACHE_SIZE;
213                 OBD_ALLOC_PTR_ARRAY(idc, i);
214                 if (idc == NULL)
215                         return ERR_PTR(-ENOMEM);
216                 if (oti->oti_ins_cache != NULL) {
217                         memcpy(idc, oti->oti_ins_cache,
218                                oti->oti_ins_cache_used * sizeof(*idc));
219                         OBD_FREE_PTR_ARRAY(oti->oti_ins_cache,
220                                            oti->oti_ins_cache_used);
221                 }
222                 oti->oti_ins_cache = idc;
223                 oti->oti_ins_cache_size = i;
224         }
225
226         idc = oti->oti_ins_cache + oti->oti_ins_cache_used++;
227         idc->oic_fid = *fid;
228         idc->oic_dev = osd;
229         idc->oic_lid.oii_ino = 0;
230         idc->oic_lid.oii_gen = 0;
231         idc->oic_remote = 0;
232
233         return idc;
234 }
235
236 /*
237  * lookup mapping for the given fid in the cache, initialize a
238  * new one if not found. the initialization checks whether the
239  * object is local or remote. for local objects, OI is used to
240  * learn ino/generation. the function is used when the caller
241  * has no information about the object, e.g. at dt_insert().
242  */
243 static struct osd_idmap_cache *
244 osd_idc_find_or_init(const struct lu_env *env, struct osd_device *osd,
245                      const struct lu_fid *fid)
246 {
247         struct osd_idmap_cache *idc;
248         int rc;
249
250         idc = osd_idc_find(env, osd, fid);
251         LASSERT(!IS_ERR(idc));
252         if (idc != NULL)
253                 return idc;
254
255         CDEBUG(D_INODE, "%s: FID "DFID" not in the id map cache\n",
256                osd->od_svname, PFID(fid));
257
258         /* new mapping is needed */
259         idc = osd_idc_add(env, osd, fid);
260         if (IS_ERR(idc)) {
261                 CERROR("%s: FID "DFID" add id map cache failed: %ld\n",
262                        osd->od_svname, PFID(fid), PTR_ERR(idc));
263                 return idc;
264         }
265
266         /* initialize it */
267         rc = osd_remote_fid(env, osd, fid);
268         if (unlikely(rc < 0))
269                 return ERR_PTR(rc);
270
271         if (rc == 0) {
272                 /* the object is local, lookup in OI */
273                 /* XXX: probably cheaper to lookup in LU first? */
274                 rc = osd_oi_lookup(osd_oti_get(env), osd, fid,
275                                    &idc->oic_lid, 0);
276                 if (unlikely(rc < 0)) {
277                         CERROR("can't lookup: rc = %d\n", rc);
278                         return ERR_PTR(rc);
279                 }
280         } else {
281                 /* the object is remote */
282                 idc->oic_remote = 1;
283         }
284
285         return idc;
286 }
287
288 /*
289  * lookup mapping for given FID and fill it from the given object.
290  * the object is lolcal by definition.
291  */
292 static int osd_idc_find_and_init(const struct lu_env *env,
293                                  struct osd_device *osd,
294                                  struct osd_object *obj)
295 {
296         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
297         struct osd_idmap_cache *idc;
298
299         idc = osd_idc_find(env, osd, fid);
300         LASSERT(!IS_ERR(idc));
301         if (idc != NULL) {
302                 if (obj->oo_inode == NULL)
303                         return 0;
304                 if (idc->oic_lid.oii_ino != obj->oo_inode->i_ino) {
305                         LASSERT(idc->oic_lid.oii_ino == 0);
306                         idc->oic_lid.oii_ino = obj->oo_inode->i_ino;
307                         idc->oic_lid.oii_gen = obj->oo_inode->i_generation;
308                 }
309                 return 0;
310         }
311
312         CDEBUG(D_INODE, "%s: FID "DFID" not in the id map cache\n",
313                osd->od_svname, PFID(fid));
314
315         /* new mapping is needed */
316         idc = osd_idc_add(env, osd, fid);
317         if (IS_ERR(idc)) {
318                 CERROR("%s: FID "DFID" add id map cache failed: %ld\n",
319                        osd->od_svname, PFID(fid), PTR_ERR(idc));
320                 return PTR_ERR(idc);
321         }
322
323         if (obj->oo_inode != NULL) {
324                 idc->oic_lid.oii_ino = obj->oo_inode->i_ino;
325                 idc->oic_lid.oii_gen = obj->oo_inode->i_generation;
326         }
327         return 0;
328 }
329
330 /*
331  * OSD object methods.
332  */
333
334 /*
335  * Concurrency: no concurrent access is possible that early in object
336  * life-cycle.
337  */
338 static struct lu_object *osd_object_alloc(const struct lu_env *env,
339                                           const struct lu_object_header *hdr,
340                                           struct lu_device *d)
341 {
342         struct osd_object *mo;
343
344         OBD_ALLOC_PTR(mo);
345         if (mo != NULL) {
346                 struct lu_object *l;
347                 struct lu_object_header *h;
348                 struct osd_device *o = osd_dev(d);
349
350                 l = &mo->oo_dt.do_lu;
351                 if (unlikely(o->od_in_init)) {
352                         OBD_ALLOC_PTR(h);
353                         if (!h) {
354                                 OBD_FREE_PTR(mo);
355                                 return NULL;
356                         }
357
358                         lu_object_header_init(h);
359                         lu_object_init(l, h, d);
360                         lu_object_add_top(h, l);
361                         mo->oo_header = h;
362                 } else {
363                         dt_object_init(&mo->oo_dt, NULL, d);
364                         mo->oo_header = NULL;
365                 }
366
367                 mo->oo_dt.do_ops = &osd_obj_ops;
368                 l->lo_ops = &osd_lu_obj_ops;
369                 init_rwsem(&mo->oo_sem);
370                 init_rwsem(&mo->oo_ext_idx_sem);
371                 spin_lock_init(&mo->oo_guard);
372                 INIT_LIST_HEAD(&mo->oo_xattr_list);
373                 return l;
374         }
375         return NULL;
376 }
377
378 int osd_get_lma(struct osd_thread_info *info, struct inode *inode,
379                 struct dentry *dentry, struct lustre_ost_attrs *loa)
380 {
381         int rc;
382
383         rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
384                              (void *)loa, sizeof(*loa));
385         if (rc > 0) {
386                 struct lustre_mdt_attrs *lma = &loa->loa_lma;
387
388                 if (rc < sizeof(*lma))
389                         return -EINVAL;
390
391                 rc = 0;
392                 lustre_loa_swab(loa, true);
393                 /* Check LMA compatibility */
394                 if (lma->lma_incompat & ~LMA_INCOMPAT_SUPP) {
395                         CWARN("%s: unsupported incompat LMA feature(s) %#x "
396                               "for fid = "DFID", ino = %lu\n",
397                               osd_ino2name(inode),
398                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
399                               PFID(&lma->lma_self_fid), inode->i_ino);
400                         rc = -EOPNOTSUPP;
401                 }
402         } else if (rc == 0) {
403                 rc = -ENODATA;
404         }
405
406         return rc;
407 }
408
409 /*
410  * retrieve object from backend ext fs.
411  **/
412 struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
413                        struct osd_inode_id *id)
414 {
415         int rc;
416         struct inode *inode = NULL;
417
418         /*
419          * if we look for an inode withing a running
420          * transaction, then we risk to deadlock
421          * osd_dirent_check_repair() breaks this
422          */
423          /* LASSERT(current->journal_info == NULL); */
424
425         inode = osd_ldiskfs_iget(osd_sb(dev), id->oii_ino);
426         if (IS_ERR(inode)) {
427                 CDEBUG(D_INODE, "no inode: ino = %u, rc = %ld\n",
428                        id->oii_ino, PTR_ERR(inode));
429         } else if (id->oii_gen != OSD_OII_NOGEN &&
430                    inode->i_generation != id->oii_gen) {
431                 CDEBUG(D_INODE, "unmatched inode: ino = %u, oii_gen = %u, "
432                        "i_generation = %u\n",
433                        id->oii_ino, id->oii_gen, inode->i_generation);
434                 iput(inode);
435                 inode = ERR_PTR(-ESTALE);
436         } else if (inode->i_nlink == 0) {
437                 /*
438                  * due to parallel readdir and unlink,
439                  * we can have dead inode here.
440                  */
441                 CDEBUG(D_INODE, "stale inode: ino = %u\n", id->oii_ino);
442                 iput(inode);
443                 inode = ERR_PTR(-ESTALE);
444         } else if (is_bad_inode(inode)) {
445                 CWARN("%s: bad inode: ino = %u\n",
446                       osd_dev2name(dev), id->oii_ino);
447                 iput(inode);
448                 inode = ERR_PTR(-ENOENT);
449         } else if ((rc = osd_attach_jinode(inode))) {
450                 iput(inode);
451                 inode = ERR_PTR(rc);
452         } else {
453                 ldiskfs_clear_inode_state(inode, LDISKFS_STATE_LUSTRE_DESTROY);
454                 if (id->oii_gen == OSD_OII_NOGEN)
455                         osd_id_gen(id, inode->i_ino, inode->i_generation);
456
457                 /*
458                  * Do not update file c/mtime in ldiskfs.
459                  * NB: we don't have any lock to protect this because we don't
460                  * have reference on osd_object now, but contention with
461                  * another lookup + attr_set can't happen in the tiny window
462                  * between if (...) and set S_NOCMTIME.
463                  */
464                 if (!(inode->i_flags & S_NOCMTIME))
465                         inode->i_flags |= S_NOCMTIME;
466         }
467         return inode;
468 }
469
470 int osd_ldiskfs_add_entry(struct osd_thread_info *info, struct osd_device *osd,
471                           handle_t *handle, struct dentry *child,
472                           struct inode *inode, struct htree_lock *hlock)
473 {
474         int rc, rc2;
475
476         rc = __ldiskfs_add_entry(handle, child, inode, hlock);
477         if (rc == -ENOBUFS || rc == -ENOSPC) {
478                 struct lustre_ost_attrs *loa = &info->oti_ost_attrs;
479                 struct inode *parent = child->d_parent->d_inode;
480                 struct lu_fid *fid = NULL;
481                 char fidstr[FID_LEN + 1] = "unknown";
482
483                 rc2 = osd_get_lma(info, parent, child->d_parent, loa);
484                 if (!rc2) {
485                         fid = &loa->loa_lma.lma_self_fid;
486                 } else if (rc2 == -ENODATA) {
487                         if (unlikely(parent == inode->i_sb->s_root->d_inode)) {
488                                 fid = &info->oti_fid3;
489                                 lu_local_obj_fid(fid, OSD_FS_ROOT_OID);
490                         } else if (!osd->od_is_ost && osd->od_index == 0) {
491                                 fid = &info->oti_fid3;
492                                 lu_igif_build(fid, parent->i_ino,
493                                               parent->i_generation);
494                         }
495                 }
496
497                 if (fid != NULL)
498                         snprintf(fidstr, sizeof(fidstr), DFID, PFID(fid));
499
500                 /* below message is checked in sanity.sh test_129 */
501                 if (rc == -ENOSPC) {
502                         CWARN("%s: directory (inode: %lu, FID: %s) has reached max size limit\n",
503                               osd_name(osd), parent->i_ino, fidstr);
504                 } else {
505                         rc = 0; /* ignore such error now */
506                         CWARN("%s: directory (inode: %lu, FID: %s) is approaching max size limit\n",
507                               osd_name(osd), parent->i_ino, fidstr);
508                 }
509
510         }
511
512         return rc;
513 }
514
515
516 struct inode *
517 osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev,
518              struct osd_inode_id *id, struct lu_fid *fid)
519 {
520         struct lustre_ost_attrs *loa = &info->oti_ost_attrs;
521         struct inode *inode;
522         int rc;
523
524         inode = osd_iget(info, dev, id);
525         if (IS_ERR(inode))
526                 return inode;
527
528         rc = osd_get_lma(info, inode, &info->oti_obj_dentry, loa);
529         if (!rc) {
530                 *fid = loa->loa_lma.lma_self_fid;
531         } else if (rc == -ENODATA) {
532                 if (unlikely(inode == osd_sb(dev)->s_root->d_inode))
533                         lu_local_obj_fid(fid, OSD_FS_ROOT_OID);
534                 else
535                         lu_igif_build(fid, inode->i_ino, inode->i_generation);
536         } else {
537                 iput(inode);
538                 inode = ERR_PTR(rc);
539         }
540         return inode;
541 }
542
543 static struct inode *osd_iget_check(struct osd_thread_info *info,
544                                     struct osd_device *dev,
545                                     const struct lu_fid *fid,
546                                     struct osd_inode_id *id,
547                                     bool trusted)
548 {
549         struct inode *inode;
550         int rc = 0;
551
552         ENTRY;
553
554         /*
555          * The cached OI mapping is trustable. If we cannot locate the inode
556          * via the cached OI mapping, then return the failure to the caller
557          * directly without further OI checking.
558          */
559
560 again:
561         inode = osd_ldiskfs_iget(osd_sb(dev), id->oii_ino);
562         if (IS_ERR(inode)) {
563                 rc = PTR_ERR(inode);
564                 if (!trusted && (rc == -ENOENT || rc == -ESTALE))
565                         goto check_oi;
566
567                 CDEBUG(D_INODE, "no inode for FID: "DFID", ino = %u, rc = %d\n",
568                        PFID(fid), id->oii_ino, rc);
569                 GOTO(put, rc);
570         }
571
572         if (is_bad_inode(inode)) {
573                 rc = -ENOENT;
574                 if (!trusted)
575                         goto check_oi;
576
577                 CDEBUG(D_INODE, "bad inode for FID: "DFID", ino = %u\n",
578                        PFID(fid), id->oii_ino);
579                 GOTO(put, rc);
580         }
581
582         if (id->oii_gen != OSD_OII_NOGEN &&
583             inode->i_generation != id->oii_gen) {
584                 rc = -ESTALE;
585                 if (!trusted)
586                         goto check_oi;
587
588                 CDEBUG(D_INODE, "unmatched inode for FID: "DFID", ino = %u, "
589                        "oii_gen = %u, i_generation = %u\n", PFID(fid),
590                        id->oii_ino, id->oii_gen, inode->i_generation);
591                 GOTO(put, rc);
592         }
593
594         if (inode->i_nlink == 0) {
595                 rc = -ENOENT;
596                 if (!trusted)
597                         goto check_oi;
598
599                 CDEBUG(D_INODE, "stale inode for FID: "DFID", ino = %u\n",
600                        PFID(fid), id->oii_ino);
601                 GOTO(put, rc);
602         }
603
604         ldiskfs_clear_inode_state(inode, LDISKFS_STATE_LUSTRE_DESTROY);
605
606 check_oi:
607         if (rc != 0) {
608                 __u32 saved_ino = id->oii_ino;
609                 __u32 saved_gen = id->oii_gen;
610
611                 LASSERT(!trusted);
612                 LASSERTF(rc == -ESTALE || rc == -ENOENT, "rc = %d\n", rc);
613
614                 rc = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
615                 /*
616                  * XXX: There are four possible cases:
617                  *      1. rc = 0.
618                  *         Backup/restore caused the OI invalid.
619                  *      2. rc = 0.
620                  *         Someone unlinked the object but NOT removed
621                  *         the OI mapping, such as mount target device
622                  *         as ldiskfs, and modify something directly.
623                  *      3. rc = -ENOENT.
624                  *         Someone just removed the object between the
625                  *         former oi_lookup and the iget. It is normal.
626                  *      4. Other failure cases.
627                  *
628                  *      Generally, when the device is mounted, it will
629                  *      auto check whether the system is restored from
630                  *      file-level backup or not. We trust such detect
631                  *      to distinguish the 1st case from the 2nd case:
632                  *      if the OI files are consistent but may contain
633                  *      stale OI mappings because of case 2, if iget()
634                  *      returns -ENOENT or -ESTALE, then it should be
635                  *      the case 2.
636                  */
637                 if (rc != 0)
638                         /*
639                          * If the OI mapping was in OI file before the
640                          * osd_iget_check(), but now, it is disappear,
641                          * then it must be removed by race. That is a
642                          * normal race case.
643                          */
644                         GOTO(put, rc);
645
646                 /*
647                  * It is the OI scrub updated the OI mapping by race.
648                  * The new OI mapping must be valid.
649                  */
650                 if (saved_ino != id->oii_ino ||
651                     (saved_gen != id->oii_gen && saved_gen != OSD_OII_NOGEN)) {
652                         if (!IS_ERR(inode))
653                                 iput(inode);
654
655                         trusted = true;
656                         goto again;
657                 }
658
659                 if (IS_ERR(inode)) {
660                         if (dev->od_scrub.os_scrub.os_file.sf_flags &
661                             SF_INCONSISTENT)
662                                 /*
663                                  * It still can be the case 2, but we cannot
664                                  * distinguish it from the case 1. So return
665                                  * -EREMCHG to block current operation until
666                                  *  OI scrub rebuilt the OI mappings.
667                                  */
668                                 rc = -EREMCHG;
669                         else
670                                 rc = -ENOENT;
671
672                         GOTO(put, rc);
673                 }
674
675                 if (inode->i_generation == id->oii_gen)
676                         rc = -ENOENT;
677                 else
678                         rc = -EREMCHG;
679         } else {
680                 if (id->oii_gen == OSD_OII_NOGEN)
681                         osd_id_gen(id, inode->i_ino, inode->i_generation);
682
683                 /*
684                  * Do not update file c/mtime in ldiskfs.
685                  * NB: we don't have any lock to protect this because we don't
686                  * have reference on osd_object now, but contention with
687                  * another lookup + attr_set can't happen in the tiny window
688                  * between if (...) and set S_NOCMTIME.
689                  */
690                 if (!(inode->i_flags & S_NOCMTIME))
691                         inode->i_flags |= S_NOCMTIME;
692         }
693
694         GOTO(put, rc);
695
696 put:
697         if (rc != 0) {
698                 if (!IS_ERR(inode))
699                         iput(inode);
700
701                 inode = ERR_PTR(rc);
702         }
703
704         return inode;
705 }
706
707 /**
708  * \retval +v: new filter_fid does not contain self-fid
709  * \retval 0:  filter_fid_18_23, contains self-fid
710  * \retval -v: other failure cases
711  */
712 int osd_get_idif(struct osd_thread_info *info, struct inode *inode,
713                  struct dentry *dentry, struct lu_fid *fid)
714 {
715         struct filter_fid *ff = &info->oti_ff;
716         struct ost_id *ostid = &info->oti_ostid;
717         int rc;
718
719         rc = __osd_xattr_get(inode, dentry, XATTR_NAME_FID, ff, sizeof(*ff));
720         if (rc == sizeof(struct filter_fid_18_23)) {
721                 struct filter_fid_18_23 *ff_old = (void *)ff;
722
723                 ostid_set_seq(ostid, le64_to_cpu(ff_old->ff_seq));
724                 rc = ostid_set_id(ostid, le64_to_cpu(ff_old->ff_objid));
725                 /*
726                  * XXX: use 0 as the index for compatibility, the caller will
727                  * handle index related issues when necessary.
728                  */
729                 if (!rc)
730                         ostid_to_fid(fid, ostid, 0);
731         } else if (rc >= (int)sizeof(struct filter_fid_24_29)) {
732                 rc = 1;
733         } else if (rc >= 0) {
734                 rc = -EINVAL;
735         }
736
737         return rc;
738 }
739
740 static int osd_lma_self_repair(struct osd_thread_info *info,
741                                struct osd_device *osd, struct inode *inode,
742                                const struct lu_fid *fid, __u32 compat)
743 {
744         handle_t *jh;
745         int rc;
746
747         LASSERT(current->journal_info == NULL);
748
749         jh = osd_journal_start_sb(osd_sb(osd), LDISKFS_HT_MISC,
750                                   osd_dto_credits_noquota[DTO_XATTR_SET]);
751         if (IS_ERR(jh)) {
752                 rc = PTR_ERR(jh);
753                 CWARN("%s: cannot start journal for lma_self_repair: rc = %d\n",
754                       osd_name(osd), rc);
755                 return rc;
756         }
757
758         rc = osd_ea_fid_set(info, inode, fid, compat, 0);
759         if (rc != 0)
760                 CWARN("%s: cannot self repair the LMA: rc = %d\n",
761                       osd_name(osd), rc);
762         ldiskfs_journal_stop(jh);
763         return rc;
764 }
765
766 static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
767 {
768         struct osd_thread_info *info = osd_oti_get(env);
769         struct osd_device *osd = osd_obj2dev(obj);
770         struct lustre_ost_attrs *loa = &info->oti_ost_attrs;
771         struct lustre_mdt_attrs *lma = &loa->loa_lma;
772         struct inode *inode = obj->oo_inode;
773         struct dentry *dentry = &info->oti_obj_dentry;
774         struct lu_fid *fid = NULL;
775         const struct lu_fid *rfid = lu_object_fid(&obj->oo_dt.do_lu);
776         int rc;
777
778         ENTRY;
779
780         rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA,
781                              (void *)loa, sizeof(*loa));
782         if (rc == -ENODATA && !fid_is_igif(rfid) && osd->od_check_ff) {
783                 fid = &lma->lma_self_fid;
784                 rc = osd_get_idif(info, inode, dentry, fid);
785                 if (rc > 0 || (rc == -ENODATA && osd->od_index_in_idif)) {
786                         /*
787                          * For the given OST-object, if it has neither LMA nor
788                          * FID in XATTR_NAME_FID, then the given FID (which is
789                          * contained in the @obj, from client RPC for locating
790                          * the OST-object) is trusted. We use it to generate
791                          * the LMA.
792                          */
793                         osd_lma_self_repair(info, osd, inode, rfid,
794                                             LMAC_FID_ON_OST);
795                         RETURN(0);
796                 }
797         }
798
799         if (rc < 0)
800                 RETURN(rc);
801
802         if (rc > 0) {
803                 rc = 0;
804                 lustre_lma_swab(lma);
805                 if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
806                              (CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT) &&
807                               S_ISREG(inode->i_mode)))) {
808                         CWARN("%s: unsupported incompat LMA feature(s) %#x for "
809                               "fid = "DFID", ino = %lu\n", osd_name(osd),
810                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
811                               PFID(rfid), inode->i_ino);
812                         rc = -EOPNOTSUPP;
813                 } else {
814                         fid = &lma->lma_self_fid;
815                         if (lma->lma_compat & LMAC_STRIPE_INFO &&
816                             osd->od_is_ost)
817                                 obj->oo_pfid_in_lma = 1;
818                         if (unlikely(lma->lma_incompat & LMAI_REMOTE_PARENT) &&
819                             !osd->od_is_ost)
820                                 lu_object_set_agent_entry(&obj->oo_dt.do_lu);
821                 }
822         }
823
824         if (fid != NULL && unlikely(!lu_fid_eq(rfid, fid))) {
825                 if (fid_is_idif(rfid) && fid_is_idif(fid)) {
826                         struct ost_id   *oi   = &info->oti_ostid;
827                         struct lu_fid   *fid1 = &info->oti_fid3;
828                         __u32            idx  = fid_idif_ost_idx(rfid);
829
830                         /*
831                          * For old IDIF, the OST index is not part of the IDIF,
832                          * Means that different OSTs may have the same IDIFs.
833                          * Under such case, we need to make some compatible
834                          * check to make sure to trigger OI scrub properly.
835                          */
836                         if (idx != 0 && fid_idif_ost_idx(fid) == 0) {
837                                 /* Given @rfid is new, LMA is old. */
838                                 fid_to_ostid(fid, oi);
839                                 ostid_to_fid(fid1, oi, idx);
840                                 if (lu_fid_eq(fid1, rfid)) {
841                                         if (osd->od_index_in_idif)
842                                                 osd_lma_self_repair(info, osd,
843                                                         inode, rfid,
844                                                         LMAC_FID_ON_OST);
845                                         RETURN(0);
846                                 }
847                         }
848                 }
849
850                 rc = -EREMCHG;
851         }
852
853         RETURN(rc);
854 }
855
/*
 * Iteration context handed to iterate_dir() by osd_check_lmv(). The filldir
 * callback osd_stripe_dir_filldir() receives a pointer to @ctx and casts it
 * back to this structure, which is why @ctx has to stay the first member.
 */
struct osd_check_lmv_buf {
        /* please keep it as first member */
        struct dir_context ctx;
        /* per-thread info providing the scratch FID/id used by the callback */
        struct osd_thread_info *oclb_info;
        /* device the scanned directory lives on */
        struct osd_device *oclb_dev;
        /* idmap cache entry filled in when the local slave is found */
        struct osd_idmap_cache *oclb_oic;
        /* entries seen in the current iterate_dir() pass (reset per pass) */
        int oclb_items;
        /* set once the local slave object's mapping has been recorded */
        bool oclb_found;
};
865
866 /**
867  * It is called internally by ->iterate*() to filter out the
868  * local slave object's FID of the striped directory.
869  *
870  * \retval      1 found the local slave's FID
871  * \retval      0 continue to check next item
872  * \retval      -ve for failure
873  */
874 #ifdef HAVE_FILLDIR_USE_CTX
875 static int osd_stripe_dir_filldir(struct dir_context *buf,
876 #else
877 static int osd_stripe_dir_filldir(void *buf,
878 #endif
879                                   const char *name, int namelen,
880                                   loff_t offset, __u64 ino, unsigned int d_type)
881 {
882         struct osd_check_lmv_buf *oclb = (struct osd_check_lmv_buf *)buf;
883         struct osd_thread_info *oti = oclb->oclb_info;
884         struct lu_fid *fid = &oti->oti_fid3;
885         struct osd_inode_id *id = &oti->oti_id3;
886         struct osd_device *dev = oclb->oclb_dev;
887         struct osd_idmap_cache *oic = oclb->oclb_oic;
888         struct inode *inode;
889
890         oclb->oclb_items++;
891
892         if (name[0] == '.')
893                 return 0;
894
895         fid_zero(fid);
896         sscanf(name + 1, SFID, RFID(fid));
897         if (!fid_is_sane(fid))
898                 return 0;
899
900         if (osd_remote_fid(oti->oti_env, dev, fid))
901                 return 0;
902
903         osd_id_gen(id, ino, OSD_OII_NOGEN);
904         inode = osd_iget(oti, dev, id);
905         if (IS_ERR(inode))
906                 return PTR_ERR(inode);
907
908         iput(inode);
909         osd_add_oi_cache(oti, dev, id, fid);
910         oic->oic_fid = *fid;
911         oic->oic_lid = *id;
912         oic->oic_dev = dev;
913         osd_oii_insert(dev, oic, true);
914         oclb->oclb_found = true;
915
916         return 1;
917 }
918
/*
 * When looking up an item under a striped directory, we need to locate the
 * master MDT-object of the striped directory first, then the client will send
 * a lookup (getattr_by_name) RPC to the MDT with some slave MDT-object's FID
 * and the item's name. If the system is restored from MDT file level backup,
 * then before the OI scrub has completely rebuilt the OI files, the OI
 * mappings of the master MDT-object and slave MDT-object may be invalid.
 * Usually, it is not a problem for the master MDT-object, because when
 * locating the master MDT-object we do a name based lookup (for the striped
 * directory itself) first, and during that process we can set up the correct
 * OI mapping for the master MDT-object. But it is trouble for the slave
 * MDT-object, because the client will not trigger a name based lookup on the
 * MDT to locate the slave MDT-object before locating an item under the
 * striped directory. Then osd_fid_lookup() will find that the OI mapping for
 * the slave MDT-object is invalid and does not know what the right OI mapping
 * is, so the MDT has to return -EINPROGRESS to the client to notify that the
 * OI scrub is rebuilding the OI file and the related OI mapping is unknown
 * yet, please try again later. The client will then retry the RPC again and
 * again until the related OI mapping has been updated. That is quite
 * inefficient.
 *
 * To resolve the above trouble, we handle it as the following two cases:
 *
 * 1) The slave MDT-object and the master MDT-object are on different MDTs.
 *    It is relatively easy. As one of the remote MDT-objects, the slave
 *    MDT-object is linked under /REMOTE_PARENT_DIR with the name of its FID
 *    string. We can locate the slave MDT-object via lookup in the
 *    /REMOTE_PARENT_DIR directly. Please check osd_fid_lookup().
 *
 * 2) The slave MDT-object and the master MDT-object reside on the same MDT.
 *    In that case, while looking up the master MDT-object, we look up the
 *    slave MDT-object via readdir against the master MDT-object, because the
 *    slave MDT-objects' information is stored as sub-directories with the
 *    name "${FID}:${index}". When the local slave MDT-object is found, its OI
 *    mapping is recorded. Then a subsequent osd_fid_lookup() will know the
 *    correct OI mapping for the slave MDT-object.
 *
 * Returns 0 when the scan completed or when there was nothing to scan (no
 * LMV EA, or an LMV EA whose magic is not LMV_MAGIC_V1), and a negative
 * errno on failure.
 */
static int osd_check_lmv(struct osd_thread_info *oti, struct osd_device *dev,
                         struct inode *inode, struct osd_idmap_cache *oic)
{
        struct lu_buf *buf = &oti->oti_big_buf;
        struct dentry *dentry = &oti->oti_obj_dentry;
        struct file *filp;
        struct lmv_mds_md_v1 *lmv1;
        struct osd_check_lmv_buf oclb = {
                .ctx.actor = osd_stripe_dir_filldir,
                .oclb_info = oti,
                .oclb_dev = dev,
                .oclb_oic = oic,
                .oclb_found = false,
        };
        int rc = 0;

        ENTRY;

again:
        rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMV, buf->lb_buf,
                             buf->lb_len);
        if (rc == -ERANGE) {
                /* Buffer too small: ask for the EA size, grow, and retry. */
                rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMV, NULL, 0);
                if (rc > 0) {
                        lu_buf_realloc(buf, rc);
                        if (buf->lb_buf == NULL)
                                GOTO(out, rc = -ENOMEM);

                        goto again;
                }
        }

        /* No LMV EA (or an empty one): nothing to scan, not an error. */
        if (unlikely(rc == 0 || rc == -ENODATA))
                GOTO(out, rc = 0);

        if (rc < 0)
                GOTO(out, rc);

        /*
         * The read above ran without a buffer, so rc is only a size here:
         * allocate that much and read again.
         */
        if (unlikely(buf->lb_buf == NULL)) {
                lu_buf_realloc(buf, rc);
                if (buf->lb_buf == NULL)
                        GOTO(out, rc = -ENOMEM);

                goto again;
        }

        /* Only an LMV EA with the LMV_MAGIC_V1 magic is scanned. */
        lmv1 = buf->lb_buf;
        if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
                GOTO(out, rc = 0);

        filp = osd_quasi_file(oti->oti_env, inode);
        rc = osd_security_file_alloc(filp);
        if (rc)
                goto out;

        /*
         * Keep iterating until an error occurs, a pass sees no entries, the
         * local slave has been found, or the position reaches
         * LDISKFS_HTREE_EOF_64BIT.
         */
        do {
                oclb.oclb_items = 0;
                rc = iterate_dir(filp, &oclb.ctx);
        } while (rc >= 0 && oclb.oclb_items > 0 && !oclb.oclb_found &&
                 filp->f_pos != LDISKFS_HTREE_EOF_64BIT);
        inode->i_fop->release(inode, filp);

out:
        /* Positive values are normalized to 0 for the caller. */
        if (rc < 0)
                CDEBUG(D_LFSCK, "%s: fail to check LMV EA, inode = %lu/%u,"
                       DFID": rc = %d\n", osd_ino2name(inode),
                       inode->i_ino, inode->i_generation,
                       PFID(&oic->oic_fid), rc);
        else
                rc = 0;

        RETURN(rc);
}
1028
/**
 * Locate the inode that backs \a fid and attach it to \a obj->oo_inode.
 *
 * The mapping is searched in this order: the per-thread cache, the OI scrub
 * pending list, then the OI files. An inode obtained that way is validated
 * with osd_check_lma(). When the mapping is missing or does not match,
 * "/REMOTE_PARENT_DIR" is tried and the OI scrub may be started.
 *
 * \param[in] env       execution environment
 * \param[in] obj       object to set up; obj->oo_inode must be NULL on entry
 * \param[in] fid       FID to resolve
 * \param[in] conf      optional; LOC_F_NEW in loc_flags skips the lookup
 *
 * \retval 0            success; obj->oo_inode stays NULL when no inode
 *                      exists for the FID
 * \retval -EINPROGRESS the mapping cannot be used yet; it has been handed
 *                      to osd_oii_insert() for the OI scrub
 * \retval -EREMCHG     the mapping is inconsistent and the OI scrub is
 *                      either disabled (AS_NEVER) or failed to start
 * \retval -ve          other failures
 */
static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj,
                          const struct lu_fid *fid,
                          const struct lu_object_conf *conf)
{
        struct osd_thread_info *info;
        struct lu_device *ldev = obj->oo_dt.do_lu.lo_dev;
        struct osd_device *dev;
        struct osd_idmap_cache *oic;
        struct osd_inode_id *id;
        struct inode *inode = NULL;
        struct lustre_scrub *scrub;
        struct scrub_file *sf;
        __u32 flags = SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT | SS_AUTO_FULL;
        __u32 saved_ino;
        __u32 saved_gen;
        int result = 0;
        int rc1 = 0;
        /* the mapping in use came from "/REMOTE_PARENT_DIR" */
        bool remote = false;
        /* the mapping in use did not come straight from the OI files */
        bool trusted = true;
        /* the OI mapping was seen to change while we were looking */
        bool updated = false;
        /* osd_check_lma() has been reached for the current inode */
        bool checked = false;

        ENTRY;

        LINVRNT(osd_invariant(obj));
        LASSERT(obj->oo_inode == NULL);
        LASSERTF(fid_is_sane(fid) || fid_is_idif(fid), DFID"\n", PFID(fid));

        dev = osd_dev(ldev);
        scrub = &dev->od_scrub.os_scrub;
        sf = &scrub->os_file;
        info = osd_oti_get(env);
        LASSERT(info);
        oic = &info->oti_cache;

        if (OBD_FAIL_CHECK(OBD_FAIL_SRV_ENOENT))
                RETURN(-ENOENT);

        /*
         * For the object is created as locking anchor, or for the object to
         * be created on disk. No need to osd_oi_lookup() at here because FID
         * should never be re-used, if it's really a duplicate FID from
         * unexpected reason, we should be able to detect it later by calling
         * do_create->osd_oi_insert().
         */
        if (conf && conf->loc_flags & LOC_F_NEW)
                GOTO(out, result = 0);

        /* Search order: 1. per-thread cache. */
        if (lu_fid_eq(fid, &oic->oic_fid) && likely(oic->oic_dev == dev)) {
                id = &oic->oic_lid;
                goto iget;
        }

        id = &info->oti_id;
        if (!list_empty(&scrub->os_inconsistent_items)) {
                /* Search order: 2. OI scrub pending list. */
                result = osd_oii_lookup(dev, fid, id);
                if (!result)
                        goto iget;
        }

        /*
         * The OI mapping in the OI file can be updated by the OI scrub
         * when we locate the inode via FID. So it may be not trustable.
         */
        trusted = false;

        /* Search order: 3. OI files. */
        result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
        if (result == -ENOENT) {
                if (!(fid_is_norm(fid) || fid_is_igif(fid)) ||
                    fid_is_on_ost(info, dev, fid, OI_CHECK_FLD) ||
                    !ldiskfs_test_bit(osd_oi_fid2idx(dev, fid),
                                      sf->sf_oi_bitmap))
                        GOTO(out, result = 0);

                goto trigger;
        }

        /* -ESTALE is returned if inode of OST object doesn't exist */
        if (result == -ESTALE &&
            fid_is_on_ost(info, dev, fid, OI_CHECK_FLD)) {
                GOTO(out, result = 0);
        }

        if (result)
                GOTO(out, result);

iget:
        obj->oo_inode = NULL;
        /* for later passes through checks, not true on first pass */
        if (!IS_ERR_OR_NULL(inode))
                iput(inode);

        inode = osd_iget_check(info, dev, fid, id, trusted);
        if (!IS_ERR(inode)) {
                obj->oo_inode = inode;
                result = 0;
                if (remote)
                        goto trigger;

                goto check_lma;
        }

        result = PTR_ERR(inode);
        if (result == -ENOENT || result == -ESTALE)
                GOTO(out, result = 0);

        if (result != -EREMCHG)
                GOTO(out, result);

trigger:
        /*
         * We still have chance to get the valid inode: for the
         * object which is referenced by remote name entry, the
         * object on the local MDT will be linked under the dir
         * of "/REMOTE_PARENT_DIR" with its FID string as name.
         *
         * We do not know whether the object for the given FID
         * is referenced by some remote name entry or not, and
         * especially for DNE II, a multiple-linked object may
         * have many name entries reside on many MDTs.
         *
         * To simplify the operation, OSD will not distinguish
         * more, just lookup "/REMOTE_PARENT_DIR". Usually, it
         * only happened for the RPC from other MDT during the
         * OI scrub, or for the client side RPC with FID only,
         * such as FID to path, or from old connected client.
         */
        if (!remote) {
                rc1 = osd_lookup_in_remote_parent(info, dev, fid, id);
                if (!rc1) {
                        remote = true;
                        trusted = true;
                        flags |= SS_AUTO_PARTIAL;
                        flags &= ~SS_AUTO_FULL;
                        goto iget;
                }
        }

        if (scrub->os_running) {
                if (scrub->os_partial_scan && !scrub->os_in_join)
                        goto join;

                osd_add_oi_cache(info, dev, id, fid);
                if (IS_ERR_OR_NULL(inode) || result) {
                        osd_oii_insert(dev, oic, result == -ENOENT);
                        GOTO(out, result = -EINPROGRESS);
                }

                LASSERT(remote);
                LASSERT(obj->oo_inode == inode);

                osd_oii_insert(dev, oic, true);
                goto found;
        }

        if (dev->od_auto_scrub_interval == AS_NEVER) {
                if (!remote)
                        GOTO(out, result = -EREMCHG);

                LASSERT(!result);
                LASSERT(obj->oo_inode == inode);

                osd_add_oi_cache(info, dev, id, fid);
                goto found;
        }

join:
        /* Start the OI scrub; -EALREADY is not treated as a failure. */
        rc1 = osd_scrub_start(env, dev, flags);
        LCONSOLE_WARN("%s: trigger OI scrub by RPC for the " DFID" with flags "
                      "0x%x, rc = %d\n", osd_name(dev), PFID(fid), flags, rc1);
        if (rc1 && rc1 != -EALREADY)
                GOTO(out, result = -EREMCHG);

        osd_add_oi_cache(info, dev, id, fid);
        if (IS_ERR_OR_NULL(inode) || result) {
                osd_oii_insert(dev, oic, result == -ENOENT);
                GOTO(out, result = -EINPROGRESS);
        }

        LASSERT(remote);
        LASSERT(obj->oo_inode == inode);

        osd_oii_insert(dev, oic, true);
        goto found;

check_lma:
        checked = true;
        if (unlikely(obj->oo_header))
                goto found;

        result = osd_check_lma(env, obj);
        if (!result)
                goto found;

        LASSERTF(id->oii_ino == inode->i_ino &&
                 id->oii_gen == inode->i_generation,
                 "locate wrong inode for FID: "DFID", %u/%u => %ld/%u\n",
                 PFID(fid), id->oii_ino, id->oii_gen,
                 inode->i_ino, inode->i_generation);

        /* Remember which inode failed so a changed mapping can be detected. */
        saved_ino = inode->i_ino;
        saved_gen = inode->i_generation;

        if (unlikely(result == -ENODATA)) {
                /*
                 * If the OI scrub updated the OI mapping by race, it
                 * must be valid. Trust the inode that has no LMA EA.
                 */
                if (updated)
                        goto found;

                result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
                if (!result) {
                        /*
                         * The OI mapping is still there, the inode is still
                         * valid. It is just because the inode has no LMA EA.
                         */
                        if (saved_ino == id->oii_ino &&
                            saved_gen == id->oii_gen)
                                goto found;

                        /*
                         * It is the OI scrub updated the OI mapping by race.
                         * The new OI mapping must be valid.
                         */
                        trusted = true;
                        updated = true;
                        goto iget;
                }

                /*
                 * "result == -ENOENT" means that the OI mapping has been
                 * removed by race, so the inode belongs to other object.
                 *
                 * Other errors can be returned directly.
                 */
                if (result == -ENOENT) {
                        LASSERT(trusted);

                        obj->oo_inode = NULL;
                        result = 0;
                }
        }

        if (result != -EREMCHG)
                GOTO(out, result);

        LASSERT(!updated);

        /*
         * if two OST objects map to the same inode, and inode mode is
         * (S_IFREG | S_ISUID | S_ISGID | S_ISVTX | 0666), which means it's
         * reserved by precreate, and not written yet, in this case, don't
         * set inode for the object whose FID mismatch, so that it can create
         * inode and not block precreate.
         */
        if (fid_is_on_ost(info, dev, fid, OI_CHECK_FLD) &&
            inode->i_mode == (S_IFREG | S_ISUID | S_ISGID | S_ISVTX | 0666)) {
                obj->oo_inode = NULL;
                GOTO(out, result = 0);
        }

        result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD);
        /*
         * "result == -ENOENT" means the cached OI mapping has been removed
         * from the OI file by race, above inode belongs to other object.
         */
        if (result == -ENOENT) {
                LASSERT(trusted);

                obj->oo_inode = NULL;
                GOTO(out, result = 0);
        }

        if (result)
                GOTO(out, result);

        /* Same mapping as before: the mismatch is real, go repair it. */
        if (saved_ino == id->oii_ino && saved_gen == id->oii_gen) {
                result = -EREMCHG;
                goto trigger;
        }

        /*
         * It is the OI scrub updated the OI mapping by race.
         * The new OI mapping must be valid.
         */
        trusted = true;
        updated = true;
        goto iget;

found:
        /*
         * Paths that skipped check_lma still need the LMA-derived object
         * flags; an inode without LMA (-ENODATA) is accepted as is.
         */
        if (!checked) {
                struct lustre_ost_attrs *loa = &info->oti_ost_attrs;
                struct lustre_mdt_attrs *lma = &info->oti_ost_attrs.loa_lma;

                result = osd_get_lma(info, inode, &info->oti_obj_dentry, loa);
                if (!result) {
                        if (lma->lma_compat & LMAC_STRIPE_INFO &&
                            dev->od_is_ost)
                                obj->oo_pfid_in_lma = 1;
                        if (unlikely(lma->lma_incompat & LMAI_REMOTE_PARENT) &&
                            !dev->od_is_ost)
                                lu_object_set_agent_entry(&obj->oo_dt.do_lu);
                } else if (result != -ENODATA) {
                        GOTO(out, result);
                }
        }

        obj->oo_compat_dot_created = 1;
        obj->oo_compat_dotdot_created = 1;

        /* The return value of osd_check_lmv() is deliberately not used. */
        if (S_ISDIR(inode->i_mode) &&
            (flags & SS_AUTO_PARTIAL || sf->sf_status == SS_SCANNING))
                osd_check_lmv(info, dev, inode, oic);

        result = osd_attach_jinode(inode);
        if (result)
                GOTO(out, result);

        if (!ldiskfs_pdo)
                GOTO(out, result = 0);

        LASSERT(!obj->oo_hl_head);
        obj->oo_hl_head = ldiskfs_htree_lock_head_alloc(HTREE_HBITS_DEF);

        GOTO(out, result = (!obj->oo_hl_head ? -ENOMEM : 0));

out:
        /*
         * On failure, or when no object was found, drop the inode reference
         * and detach it; if the mapping was a trusted one, the per-thread
         * cache entry is invalidated as well.
         */
        if (result || !obj->oo_inode) {
                if (!IS_ERR_OR_NULL(inode))
                        iput(inode);

                obj->oo_inode = NULL;
                if (trusted)
                        fid_zero(&oic->oic_fid);
        }

        LINVRNT(osd_invariant(obj));
        return result;
}
1372
1373 /*
1374  * Concurrency: shouldn't matter.
1375  */
1376 static void osd_object_init0(struct osd_object *obj)
1377 {
1378         LASSERT(obj->oo_inode != NULL);
1379         obj->oo_dt.do_body_ops = &osd_body_ops;
1380         obj->oo_dt.do_lu.lo_header->loh_attr |=
1381                 (LOHA_EXISTS | (obj->oo_inode->i_mode & S_IFMT));
1382 }
1383
1384 /*
1385  * Concurrency: no concurrent access is possible that early in object
1386  * life-cycle.
1387  */
1388 static int osd_object_init(const struct lu_env *env, struct lu_object *l,
1389                            const struct lu_object_conf *conf)
1390 {
1391         struct osd_object *obj = osd_obj(l);
1392         int result;
1393
1394         LINVRNT(osd_invariant(obj));
1395
1396         if (OBD_FAIL_PRECHECK(OBD_FAIL_MDS_LLOG_UMOUNT_RACE) &&
1397             cfs_fail_val == 2) {
1398                 struct osd_thread_info *info = osd_oti_get(env);
1399                 struct osd_idmap_cache *oic = &info->oti_cache;
1400                 /* invalidate thread cache */
1401                 memset(&oic->oic_fid, 0, sizeof(oic->oic_fid));
1402         }
1403         if (fid_is_otable_it(&l->lo_header->loh_fid)) {
1404                 obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
1405                 l->lo_header->loh_attr |= LOHA_EXISTS;
1406                 return 0;
1407         }
1408
1409         result = osd_fid_lookup(env, obj, lu_object_fid(l), conf);
1410         obj->oo_dt.do_body_ops = &osd_body_ops_new;
1411         if (result == 0 && obj->oo_inode != NULL) {
1412                 struct osd_thread_info *oti = osd_oti_get(env);
1413                 struct lustre_ost_attrs *loa = &oti->oti_ost_attrs;
1414
1415                 osd_object_init0(obj);
1416                 if (unlikely(obj->oo_header))
1417                         return 0;
1418
1419                 result = osd_get_lma(oti, obj->oo_inode,
1420                                      &oti->oti_obj_dentry, loa);
1421                 if (!result) {
1422                         /*
1423                          * Convert LMAI flags to lustre LMA flags
1424                          * and cache it to oo_lma_flags
1425                          */
1426                         obj->oo_lma_flags =
1427                                 lma_to_lustre_flags(loa->loa_lma.lma_incompat);
1428                 } else if (result == -ENODATA) {
1429                         result = 0;
1430                 }
1431         }
1432         obj->oo_dirent_count = LU_DIRENT_COUNT_UNSET;
1433
1434         LINVRNT(osd_invariant(obj));
1435         return result;
1436 }
1437
1438 /*
1439  * The first part of oxe_buf is xattr name, and is '\0' terminated.
1440  * The left part is for value, binary mode.
1441  */
1442 struct osd_xattr_entry {
1443         struct list_head        oxe_list;
1444         size_t                  oxe_len;
1445         size_t                  oxe_namelen;
1446         bool                    oxe_exist;
1447         struct rcu_head         oxe_rcu;
1448         char                    oxe_buf[0];
1449 };
1450
1451 static int osd_oxc_get(struct osd_object *obj, const char *name,
1452                        struct lu_buf *buf)
1453 {
1454         struct osd_xattr_entry *tmp;
1455         struct osd_xattr_entry *oxe = NULL;
1456         size_t namelen = strlen(name);
1457         int rc;
1458
1459         rcu_read_lock();
1460         list_for_each_entry_rcu(tmp, &obj->oo_xattr_list, oxe_list) {
1461                 if (namelen == tmp->oxe_namelen &&
1462                     strncmp(name, tmp->oxe_buf, namelen) == 0) {
1463                         oxe = tmp;
1464                         break;
1465                 }
1466         }
1467
1468         if (oxe == NULL)
1469                 GOTO(out, rc = -ENOENT);
1470
1471         if (!oxe->oxe_exist)
1472                 GOTO(out, rc = -ENODATA);
1473
1474         /* vallen */
1475         rc = oxe->oxe_len - sizeof(*oxe) - oxe->oxe_namelen - 1;
1476         LASSERT(rc > 0);
1477
1478         if (buf->lb_buf == NULL)
1479                 GOTO(out, rc);
1480
1481         if (buf->lb_len < rc)
1482                 GOTO(out, rc = -ERANGE);
1483
1484         memcpy(buf->lb_buf, &oxe->oxe_buf[namelen + 1], rc);
1485 out:
1486         rcu_read_unlock();
1487
1488         return rc;
1489 }
1490
1491 static void osd_oxc_free(struct rcu_head *head)
1492 {
1493         struct osd_xattr_entry *oxe;
1494
1495         oxe = container_of(head, struct osd_xattr_entry, oxe_rcu);
1496         OBD_FREE(oxe, oxe->oxe_len);
1497 }
1498
1499 static void osd_oxc_add(struct osd_object *obj, const char *name,
1500                         const char *buf, int buflen)
1501 {
1502         struct osd_xattr_entry *oxe;
1503         struct osd_xattr_entry *old = NULL;
1504         struct osd_xattr_entry *tmp;
1505         size_t namelen = strlen(name);
1506         size_t len = sizeof(*oxe) + namelen + 1 + buflen;
1507
1508         OBD_ALLOC(oxe, len);
1509         if (oxe == NULL)
1510                 return;
1511
1512         INIT_LIST_HEAD(&oxe->oxe_list);
1513         oxe->oxe_len = len;
1514         oxe->oxe_namelen = namelen;
1515         memcpy(oxe->oxe_buf, name, namelen);
1516         if (buflen > 0) {
1517                 LASSERT(buf != NULL);
1518                 memcpy(oxe->oxe_buf + namelen + 1, buf, buflen);
1519                 oxe->oxe_exist = true;
1520         } else {
1521                 oxe->oxe_exist = false;
1522         }
1523
1524         /* this should be rarely called, just remove old and add new */
1525         spin_lock(&obj->oo_guard);
1526         list_for_each_entry(tmp, &obj->oo_xattr_list, oxe_list) {
1527                 if (namelen == tmp->oxe_namelen &&
1528                     strncmp(name, tmp->oxe_buf, namelen) == 0) {
1529                         old = tmp;
1530                         break;
1531                 }
1532         }
1533         if (old != NULL) {
1534                 list_replace_rcu(&old->oxe_list, &oxe->oxe_list);
1535                 call_rcu(&old->oxe_rcu, osd_oxc_free);
1536         } else {
1537                 list_add_tail_rcu(&oxe->oxe_list, &obj->oo_xattr_list);
1538         }
1539         spin_unlock(&obj->oo_guard);
1540 }
1541
1542 static void osd_oxc_del(struct osd_object *obj, const char *name)
1543 {
1544         struct osd_xattr_entry *oxe;
1545         size_t namelen = strlen(name);
1546
1547         spin_lock(&obj->oo_guard);
1548         list_for_each_entry(oxe, &obj->oo_xattr_list, oxe_list) {
1549                 if (namelen == oxe->oxe_namelen &&
1550                     strncmp(name, oxe->oxe_buf, namelen) == 0) {
1551                         list_del_rcu(&oxe->oxe_list);
1552                         call_rcu(&oxe->oxe_rcu, osd_oxc_free);
1553                         break;
1554                 }
1555         }
1556         spin_unlock(&obj->oo_guard);
1557 }
1558
1559 static void osd_oxc_fini(struct osd_object *obj)
1560 {
1561         struct osd_xattr_entry *oxe, *next;
1562
1563         list_for_each_entry_safe(oxe, next, &obj->oo_xattr_list, oxe_list) {
1564                 list_del(&oxe->oxe_list);
1565                 OBD_FREE(oxe, oxe->oxe_len);
1566         }
1567 }
1568
1569 /*
1570  * Concurrency: no concurrent access is possible that late in object
1571  * life-cycle.
1572  */
1573 static void osd_object_free(const struct lu_env *env, struct lu_object *l)
1574 {
1575         struct osd_object *obj = osd_obj(l);
1576         struct lu_object_header *h = obj->oo_header;
1577
1578         LINVRNT(osd_invariant(obj));
1579
1580         osd_oxc_fini(obj);
1581         dt_object_fini(&obj->oo_dt);
1582         if (obj->oo_hl_head != NULL)
1583                 ldiskfs_htree_lock_head_free(obj->oo_hl_head);
1584         /* obj doesn't contain an lu_object_header, so we don't need call_rcu */
1585         OBD_FREE_PTR(obj);
1586         if (unlikely(h))
1587                 lu_object_header_free(h);
1588 }
1589
1590 /*
1591  * Concurrency: no concurrent access is possible that late in object
1592  * life-cycle.
1593  */
1594 static void osd_index_fini(struct osd_object *o)
1595 {
1596         struct iam_container *bag;
1597
1598         if (o->oo_dir != NULL) {
1599                 bag = &o->oo_dir->od_container;
1600                 if (o->oo_inode != NULL) {
1601                         if (bag->ic_object == o->oo_inode)
1602                                 iam_container_fini(bag);
1603                 }
1604                 OBD_FREE_PTR(o->oo_dir);
1605                 o->oo_dir = NULL;
1606         }
1607 }
1608
/*
 * NOTE(review): presumably the number of journal credits reserved for
 * deleting an OI entry and for deleting an inode; the users are not part
 * of this excerpt, so confirm against them.
 */
enum {
        OSD_TXN_OI_DELETE_CREDITS    = 20,
        OSD_TXN_INODE_DELETE_CREDITS = 20
};
1613
1614 /*
1615  * Journal
1616  */
1617
1618 #if OSD_THANDLE_STATS
1619 /**
1620  * Set time when the handle is allocated
1621  */
1622 static void osd_th_alloced(struct osd_thandle *oth)
1623 {
1624         oth->oth_alloced = ktime_get();
1625 }
1626
1627 /**
1628  * Set time when the handle started
1629  */
1630 static void osd_th_started(struct osd_thandle *oth)
1631 {
1632         oth->oth_started = ktime_get();
1633 }
1634
1635 /**
1636  * Check whether the we deal with this handle for too long.
1637  */
1638 static void __osd_th_check_slow(void *oth, struct osd_device *dev,
1639                                 ktime_t alloced, ktime_t started,
1640                                 ktime_t closed)
1641 {
1642         ktime_t now = ktime_get();
1643
1644         LASSERT(dev != NULL);
1645
1646         lprocfs_counter_add(dev->od_stats, LPROC_OSD_THANDLE_STARTING,
1647                             ktime_us_delta(started, alloced));
1648         lprocfs_counter_add(dev->od_stats, LPROC_OSD_THANDLE_OPEN,
1649                             ktime_us_delta(closed, started));
1650         lprocfs_counter_add(dev->od_stats, LPROC_OSD_THANDLE_CLOSING,
1651                             ktime_us_delta(now, closed));
1652
1653         if (ktime_before(ktime_add_ns(alloced, 30 * NSEC_PER_SEC), now)) {
1654                 CWARN("transaction handle %p was open for too long: now %lld, alloced %lld, started %lld, closed %lld\n",
1655                                 oth, now, alloced, started, closed);
1656                 libcfs_debug_dumpstack(NULL);
1657         }
1658 }
1659
/*
 * Run \a expr (the statement that stops the handle) and then report the
 * handle timings through __osd_th_check_slow().
 *
 * The allocation/start timestamps are copied out of \a oth and the close
 * time is taken before \a expr is evaluated.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. in an unbraced if/else), and the arguments are
 * parenthesized so that any pointer expression can be passed as \a oth.
 */
#define OSD_CHECK_SLOW_TH(oth, dev, expr)                               \
do {                                                                    \
        ktime_t __closed = ktime_get();                                 \
        ktime_t __alloced = (oth)->oth_alloced;                         \
        ktime_t __started = (oth)->oth_started;                         \
                                                                        \
        expr;                                                           \
        __osd_th_check_slow((oth), (dev), __alloced, __started,         \
                            __closed);                                  \
} while (0)
1669
1670 #else /* OSD_THANDLE_STATS */
1671
/* handle statistics compiled out: keep only the wrapped statement */
#define osd_th_alloced(h)                       do { } while (0)
#define osd_th_started(h)                       do { } while (0)
#define OSD_CHECK_SLOW_TH(oth, dev, expr)       expr
1675
1676 #endif /* OSD_THANDLE_STATS */
1677
1678 /*
1679  * Concurrency: doesn't access mutable data.
1680  */
1681 static int osd_param_is_not_sane(const struct osd_device *dev,
1682                                  const struct thandle *th)
1683 {
1684         struct osd_thandle *oh = container_of(th, typeof(*oh), ot_super);
1685
1686         return oh->ot_credits > osd_transaction_size(dev);
1687 }
1688
1689 /*
1690  * Concurrency: shouldn't matter.
1691  */
1692 static void osd_trans_commit_cb(struct super_block *sb,
1693                                 struct ldiskfs_journal_cb_entry *jcb, int error)
1694 {
1695         struct osd_thandle *oh = container_of(jcb, struct osd_thandle, ot_jcb);
1696         struct thandle *th = &oh->ot_super;
1697         struct lu_device *lud = &th->th_dev->dd_lu_dev;
1698         struct dt_txn_commit_cb *dcb, *tmp;
1699
1700         LASSERT(oh->ot_handle == NULL);
1701
1702         if (error)
1703                 CERROR("transaction @0x%p commit error: %d\n", th, error);
1704
1705         OBD_FAIL_TIMEOUT(OBD_FAIL_OST_DELAY_TRANS, 40);
1706         /* call per-transaction callbacks if any */
1707         list_for_each_entry_safe(dcb, tmp, &oh->ot_commit_dcb_list,
1708                                  dcb_linkage) {
1709                 LASSERTF(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC,
1710                          "commit callback entry: magic=%x name='%s'\n",
1711                          dcb->dcb_magic, dcb->dcb_name);
1712                 list_del_init(&dcb->dcb_linkage);
1713                 dcb->dcb_func(NULL, th, dcb, error);
1714         }
1715
1716         lu_ref_del_at(&lud->ld_reference, &oh->ot_dev_link, "osd-tx", th);
1717         lu_device_put(lud);
1718         th->th_dev = NULL;
1719
1720         OBD_FREE_PTR(oh);
1721 }
1722
1723 static struct thandle *osd_trans_create(const struct lu_env *env,
1724                                         struct dt_device *d)
1725 {
1726         struct osd_thread_info *oti = osd_oti_get(env);
1727         struct osd_iobuf *iobuf = &oti->oti_iobuf;
1728         struct osd_thandle *oh;
1729         struct thandle *th;
1730
1731         ENTRY;
1732
1733         if (d->dd_rdonly) {
1734                 CERROR("%s: someone try to start transaction under "
1735                        "readonly mode, should be disabled.\n",
1736                        osd_name(osd_dt_dev(d)));
1737                 dump_stack();
1738                 RETURN(ERR_PTR(-EROFS));
1739         }
1740
1741         /* on pending IO in this thread should left from prev. request */
1742         LASSERT(atomic_read(&iobuf->dr_numreqs) == 0);
1743
1744         sb_start_write(osd_sb(osd_dt_dev(d)));
1745
1746         OBD_ALLOC_GFP(oh, sizeof(*oh), GFP_NOFS);
1747         if (!oh) {
1748                 sb_end_write(osd_sb(osd_dt_dev(d)));
1749                 RETURN(ERR_PTR(-ENOMEM));
1750         }
1751
1752         oh->ot_quota_trans = &oti->oti_quota_trans;
1753         memset(oh->ot_quota_trans, 0, sizeof(*oh->ot_quota_trans));
1754         th = &oh->ot_super;
1755         th->th_dev = d;
1756         th->th_result = 0;
1757         oh->ot_credits = 0;
1758         INIT_LIST_HEAD(&oh->ot_commit_dcb_list);
1759         INIT_LIST_HEAD(&oh->ot_stop_dcb_list);
1760         INIT_LIST_HEAD(&oh->ot_trunc_locks);
1761         osd_th_alloced(oh);
1762
1763         memset(oti->oti_declare_ops, 0,
1764                sizeof(oti->oti_declare_ops));
1765         memset(oti->oti_declare_ops_cred, 0,
1766                sizeof(oti->oti_declare_ops_cred));
1767         memset(oti->oti_declare_ops_used, 0,
1768                sizeof(oti->oti_declare_ops_used));
1769
1770         oti->oti_ins_cache_depth++;
1771
1772         RETURN(th);
1773 }
1774
1775 void osd_trans_dump_creds(const struct lu_env *env, struct thandle *th)
1776 {
1777         struct osd_thread_info *oti = osd_oti_get(env);
1778         struct osd_thandle *oh;
1779
1780         oh = container_of(th, struct osd_thandle, ot_super);
1781         LASSERT(oh != NULL);
1782
1783         CWARN("  create: %u/%u/%u, destroy: %u/%u/%u\n",
1784               oti->oti_declare_ops[OSD_OT_CREATE],
1785               oti->oti_declare_ops_cred[OSD_OT_CREATE],
1786               oti->oti_declare_ops_used[OSD_OT_CREATE],
1787               oti->oti_declare_ops[OSD_OT_DESTROY],
1788               oti->oti_declare_ops_cred[OSD_OT_DESTROY],
1789               oti->oti_declare_ops_used[OSD_OT_DESTROY]);
1790         CWARN("  attr_set: %u/%u/%u, xattr_set: %u/%u/%u\n",
1791               oti->oti_declare_ops[OSD_OT_ATTR_SET],
1792               oti->oti_declare_ops_cred[OSD_OT_ATTR_SET],
1793               oti->oti_declare_ops_used[OSD_OT_ATTR_SET],
1794               oti->oti_declare_ops[OSD_OT_XATTR_SET],
1795               oti->oti_declare_ops_cred[OSD_OT_XATTR_SET],
1796               oti->oti_declare_ops_used[OSD_OT_XATTR_SET]);
1797         CWARN("  write: %u/%u/%u, punch: %u/%u/%u, quota %u/%u/%u\n",
1798               oti->oti_declare_ops[OSD_OT_WRITE],
1799               oti->oti_declare_ops_cred[OSD_OT_WRITE],
1800               oti->oti_declare_ops_used[OSD_OT_WRITE],
1801               oti->oti_declare_ops[OSD_OT_PUNCH],
1802               oti->oti_declare_ops_cred[OSD_OT_PUNCH],
1803               oti->oti_declare_ops_used[OSD_OT_PUNCH],
1804               oti->oti_declare_ops[OSD_OT_QUOTA],
1805               oti->oti_declare_ops_cred[OSD_OT_QUOTA],
1806               oti->oti_declare_ops_used[OSD_OT_QUOTA]);
1807         CWARN("  insert: %u/%u/%u, delete: %u/%u/%u\n",
1808               oti->oti_declare_ops[OSD_OT_INSERT],
1809               oti->oti_declare_ops_cred[OSD_OT_INSERT],
1810               oti->oti_declare_ops_used[OSD_OT_INSERT],
1811               oti->oti_declare_ops[OSD_OT_DELETE],
1812               oti->oti_declare_ops_cred[OSD_OT_DELETE],
1813               oti->oti_declare_ops_used[OSD_OT_DELETE]);
1814         CWARN("  ref_add: %u/%u/%u, ref_del: %u/%u/%u\n",
1815               oti->oti_declare_ops[OSD_OT_REF_ADD],
1816               oti->oti_declare_ops_cred[OSD_OT_REF_ADD],
1817               oti->oti_declare_ops_used[OSD_OT_REF_ADD],
1818               oti->oti_declare_ops[OSD_OT_REF_DEL],
1819               oti->oti_declare_ops_cred[OSD_OT_REF_DEL],
1820               oti->oti_declare_ops_used[OSD_OT_REF_DEL]);
1821 }
1822
1823 /*
1824  * Concurrency: shouldn't matter.
1825  */
1826 static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
1827                            struct thandle *th)
1828 {
1829         struct osd_thread_info *oti = osd_oti_get(env);
1830         struct osd_device *dev = osd_dt_dev(d);
1831         handle_t *jh;
1832         struct osd_thandle *oh;
1833         int rc;
1834
1835         ENTRY;
1836
1837         LASSERT(current->journal_info == NULL);
1838
1839         oh = container_of(th, struct osd_thandle, ot_super);
1840         LASSERT(oh != NULL);
1841         LASSERT(oh->ot_handle == NULL);
1842
1843         rc = dt_txn_hook_start(env, d, th);
1844         if (rc != 0)
1845                 GOTO(out, rc);
1846
1847         if (unlikely(osd_param_is_not_sane(dev, th))) {
1848                 static unsigned long last_printed;
1849                 static int last_credits;
1850
1851                 /*
1852                  * don't make noise on a tiny testing systems
1853                  * actual credits misuse will be caught anyway
1854                  */
1855                 if (last_credits != oh->ot_credits &&
1856                     time_after(jiffies, last_printed +
1857                                cfs_time_seconds(60)) &&
1858                     osd_transaction_size(dev) > 512) {
1859                         CWARN("%s: credits %u > trans_max %u\n", osd_name(dev),
1860                               oh->ot_credits, osd_transaction_size(dev));
1861                         osd_trans_dump_creds(env, th);
1862                         libcfs_debug_dumpstack(NULL);
1863                         last_credits = oh->ot_credits;
1864                         last_printed = jiffies;
1865                 }
1866                 /*
1867                  * XXX Limit the credits to 'max_transaction_buffers', and
1868                  *     let the underlying filesystem to catch the error if
1869                  *     we really need so many credits.
1870                  *
1871                  *     This should be removed when we can calculate the
1872                  *     credits precisely.
1873                  */
1874                 oh->ot_credits = osd_transaction_size(dev);
1875         } else if (ldiskfs_track_declares_assert != 0) {
1876                 /*
1877                  * reserve few credits to prevent an assertion in JBD
1878                  * our debugging mechanism will be able to detected
1879                  * overuse. this can help to debug single-update
1880                  * transactions
1881                  */
1882                 oh->ot_credits += 10;
1883                 if (unlikely(osd_param_is_not_sane(dev, th)))
1884                         oh->ot_credits = osd_transaction_size(dev);
1885         }
1886
1887         if (OBD_FAIL_CHECK(OBD_FAIL_OSD_TXN_START))
1888                 GOTO(out, rc = -EIO);
1889
1890         /*
1891          * XXX temporary stuff. Some abstraction layer should
1892          * be used.
1893          */
1894         jh = osd_journal_start_sb(osd_sb(dev), LDISKFS_HT_MISC, oh->ot_credits);
1895         osd_th_started(oh);
1896         if (!IS_ERR(jh)) {
1897                 oh->ot_handle = jh;
1898                 LASSERT(oti->oti_txns == 0);
1899
1900                 lu_device_get(&d->dd_lu_dev);
1901                 lu_ref_add_at(&d->dd_lu_dev.ld_reference, &oh->ot_dev_link,
1902                               "osd-tx", th);
1903                 oti->oti_txns++;
1904                 rc = 0;
1905         } else {
1906                 rc = PTR_ERR(jh);
1907         }
1908 out:
1909         RETURN(rc);
1910 }
1911
1912 static int osd_seq_exists(const struct lu_env *env,
1913                           struct osd_device *osd, u64 seq)
1914 {
1915         struct lu_seq_range *range = &osd_oti_get(env)->oti_seq_range;
1916         struct seq_server_site *ss = osd_seq_site(osd);
1917         int rc;
1918
1919         ENTRY;
1920
1921         LASSERT(ss != NULL);
1922         LASSERT(ss->ss_server_fld != NULL);
1923
1924         rc = osd_fld_lookup(env, osd, seq, range);
1925         if (rc != 0) {
1926                 if (rc != -ENOENT)
1927                         CERROR("%s: can't lookup FLD sequence %#llx: rc = %d\n",
1928                                osd_name(osd), seq, rc);
1929                 RETURN(0);
1930         }
1931
1932         RETURN(ss->ss_node_id == range->lsr_index);
1933 }
1934
1935 static void osd_trans_stop_cb(struct osd_thandle *oth, int result)
1936 {
1937         struct dt_txn_commit_cb *dcb;
1938         struct dt_txn_commit_cb *tmp;
1939
1940         /* call per-transaction stop callbacks if any */
1941         list_for_each_entry_safe(dcb, tmp, &oth->ot_stop_dcb_list,
1942                                  dcb_linkage) {
1943                 LASSERTF(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC,
1944                          "commit callback entry: magic=%x name='%s'\n",
1945                          dcb->dcb_magic, dcb->dcb_name);
1946                 list_del_init(&dcb->dcb_linkage);
1947                 dcb->dcb_func(NULL, &oth->ot_super, dcb, result);
1948         }
1949 }
1950
1951 /*
1952  * Concurrency: shouldn't matter.
1953  */
1954 static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
1955                           struct thandle *th)
1956 {
1957         struct osd_thread_info *oti = osd_oti_get(env);
1958         struct osd_thandle *oh;
1959         struct osd_iobuf *iobuf = &oti->oti_iobuf;
1960         struct osd_device *osd = osd_dt_dev(th->th_dev);
1961         struct qsd_instance *qsd = osd_def_qsd(osd);
1962         struct lquota_trans *qtrans;
1963         LIST_HEAD(truncates);
1964         int rc = 0, remove_agents = 0;
1965
1966         ENTRY;
1967
1968         oh = container_of(th, struct osd_thandle, ot_super);
1969
1970         remove_agents = oh->ot_remove_agents;
1971
1972         qtrans = oh->ot_quota_trans;
1973         oh->ot_quota_trans = NULL;
1974
1975         /* move locks to local list, stop tx, execute truncates */
1976         list_splice(&oh->ot_trunc_locks, &truncates);
1977
1978         if (oh->ot_handle != NULL) {
1979                 int rc2;
1980
1981                 handle_t *hdl = oh->ot_handle;
1982
1983                 /*
1984                  * add commit callback
1985                  * notice we don't do this in osd_trans_start()
1986                  * as underlying transaction can change during truncate
1987                  */
1988                 ldiskfs_journal_callback_add(hdl, osd_trans_commit_cb,
1989                                              &oh->ot_jcb);
1990
1991                 LASSERT(oti->oti_txns == 1);
1992                 oti->oti_txns--;
1993
1994                 rc = dt_txn_hook_stop(env, th);
1995                 if (rc != 0)
1996                         CERROR("%s: failed in transaction hook: rc = %d\n",
1997                                osd_name(osd), rc);
1998
1999                 osd_trans_stop_cb(oh, rc);
2000                 /* hook functions might modify th_sync */
2001                 hdl->h_sync = th->th_sync;
2002
2003                 oh->ot_handle = NULL;
2004                 OSD_CHECK_SLOW_TH(oh, osd, rc2 = ldiskfs_journal_stop(hdl));
2005                 if (rc2 != 0)
2006                         CERROR("%s: failed to stop transaction: rc = %d\n",
2007                                osd_name(osd), rc2);
2008                 if (!rc)
2009                         rc = rc2;
2010
2011                 osd_process_truncates(&truncates);
2012         } else {
2013                 osd_trans_stop_cb(oh, th->th_result);
2014                 OBD_FREE_PTR(oh);
2015         }
2016
2017         osd_trunc_unlock_all(env, &truncates);
2018
2019         /* inform the quota slave device that the transaction is stopping */
2020         qsd_op_end(env, qsd, qtrans);
2021
2022         /*
2023          * as we want IO to journal and data IO be concurrent, we don't block
2024          * awaiting data IO completion in osd_do_bio(), instead we wait here
2025          * once transaction is submitted to the journal. all reqular requests
2026          * don't do direct IO (except read/write), thus this wait_event becomes
2027          * no-op for them.
2028          *
2029          * IMPORTANT: we have to wait till any IO submited by the thread is
2030          * completed otherwise iobuf may be corrupted by different request
2031          */
2032         wait_event(iobuf->dr_wait,
2033                        atomic_read(&iobuf->dr_numreqs) == 0);
2034         osd_fini_iobuf(osd, iobuf);
2035         if (!rc)
2036                 rc = iobuf->dr_error;
2037
2038         if (unlikely(remove_agents != 0))
2039                 osd_process_scheduled_agent_removals(env, osd);
2040
2041         oti->oti_ins_cache_depth--;
2042         /* reset OI cache for safety */
2043         if (oti->oti_ins_cache_depth == 0)
2044                 oti->oti_ins_cache_used = 0;
2045
2046         sb_end_write(osd_sb(osd));
2047
2048         RETURN(rc);
2049 }
2050
2051 static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb)
2052 {
2053         struct osd_thandle *oh = container_of(th, struct osd_thandle,
2054                                               ot_super);
2055
2056         LASSERT(dcb->dcb_magic == TRANS_COMMIT_CB_MAGIC);
2057         LASSERT(&dcb->dcb_func != NULL);
2058         if (dcb->dcb_flags & DCB_TRANS_STOP)
2059                 list_add(&dcb->dcb_linkage, &oh->ot_stop_dcb_list);
2060         else
2061                 list_add(&dcb->dcb_linkage, &oh->ot_commit_dcb_list);
2062
2063         return 0;
2064 }
2065
2066 /*
2067  * Called just before object is freed. Releases all resources except for
2068  * object itself (that is released by osd_object_free()).
2069  *
2070  * Concurrency: no concurrent access is possible that late in object
2071  * life-cycle.
2072  */
2073 static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
2074 {
2075         struct osd_object *obj = osd_obj(l);
2076         struct qsd_instance *qsd = osd_def_qsd(osd_obj2dev(obj));
2077         struct inode *inode = obj->oo_inode;
2078         __u64 projid;
2079         qid_t uid;
2080         qid_t gid;
2081
2082         LINVRNT(osd_invariant(obj));
2083
2084         /*
2085          * If object is unlinked remove fid->ino mapping from object index.
2086          */
2087
2088         osd_index_fini(obj);
2089
2090         if (!inode)
2091                 return;
2092
2093         if (osd_has_index(obj) &&  obj->oo_dt.do_index_ops == &osd_index_iam_ops)
2094                 ldiskfs_set_inode_flag(inode, LDISKFS_INODE_JOURNAL_DATA);
2095
2096         uid = i_uid_read(inode);
2097         gid = i_gid_read(inode);
2098         projid = i_projid_read(inode);
2099
2100         obj->oo_inode = NULL;
2101         iput(inode);
2102
2103         /* do not rebalance quota if the caller needs to release memory
2104          * otherwise qsd_refresh_usage() may went into a new ldiskfs
2105          * transaction and risk to deadlock - LU-12178 */
2106         if (current->flags & (PF_MEMALLOC | PF_KSWAPD))
2107                 return;
2108
2109         if (!obj->oo_header && qsd) {
2110                 struct osd_thread_info *info = osd_oti_get(env);
2111                 struct lquota_id_info *qi = &info->oti_qi;
2112
2113                 /* Release granted quota to master if necessary */
2114                 qi->lqi_id.qid_uid = uid;
2115                 qsd_op_adjust(env, qsd, &qi->lqi_id, USRQUOTA);
2116
2117                 qi->lqi_id.qid_uid = gid;
2118                 qsd_op_adjust(env, qsd, &qi->lqi_id, GRPQUOTA);
2119
2120                 qi->lqi_id.qid_uid = projid;
2121                 qsd_op_adjust(env, qsd, &qi->lqi_id, PRJQUOTA);
2122         }
2123 }
2124
/*
 * Release hook of the object: only verifies that an object that was not
 * destroyed is not released while its inode has no links left.
 *
 * Concurrency: ->loo_object_release() is called under site spin-lock.
 */
static void osd_object_release(const struct lu_env *env,
                               struct lu_object *l)
{
        struct osd_object *o = osd_obj(l);

        /*
         * nobody should be releasing a non-destroyed object with nlink=0
         * the API allows this, but ldiskfs doesn't like it and then reports
         * this inode as deleted
         */
        LASSERT(!(o->oo_destroyed == 0 && o->oo_inode &&
                  o->oo_inode->i_nlink == 0));
}
2141
2142 /*
2143  * Concurrency: shouldn't matter.
2144  */
2145 static int osd_object_print(const struct lu_env *env, void *cookie,
2146                             lu_printer_t p, const struct lu_object *l)
2147 {
2148         struct osd_object *o = osd_obj(l);
2149         struct iam_descr *d;
2150
2151         if (o->oo_dir != NULL)
2152                 d = o->oo_dir->od_container.ic_descr;
2153         else
2154                 d = NULL;
2155         return (*p)(env, cookie,
2156                     LUSTRE_OSD_LDISKFS_NAME"-object@%p(i:%p:%lu/%u)[%s]",
2157                     o, o->oo_inode,
2158                     o->oo_inode ? o->oo_inode->i_ino : 0UL,
2159                     o->oo_inode ? o->oo_inode->i_generation : 0,
2160                     d ? d->id_ops->id_name : "plain");
2161 }
2162
2163 /*
2164  * Concurrency: shouldn't matter.
2165  */
2166 int osd_statfs(const struct lu_env *env, struct dt_device *d,
2167                 struct obd_statfs *sfs, struct obd_statfs_info *info)
2168 {
2169         struct osd_device *osd = osd_dt_dev(d);
2170         struct super_block *sb = osd_sb(osd);
2171         struct kstatfs *ksfs;
2172         __u64 reserved;
2173         int result = 0;
2174
2175         if (unlikely(osd->od_mnt == NULL))
2176                 return -EINPROGRESS;
2177
2178         /* osd_lproc.c call this without env, allocate ksfs for that case */
2179         if (unlikely(env == NULL)) {
2180                 OBD_ALLOC_PTR(ksfs);
2181                 if (ksfs == NULL)
2182                         return -ENOMEM;
2183         } else {
2184                 ksfs = &osd_oti_get(env)->oti_ksfs;
2185         }
2186
2187         result = sb->s_op->statfs(sb->s_root, ksfs);
2188         if (result)
2189                 goto out;
2190
2191         statfs_pack(sfs, ksfs);
2192         if (unlikely(sb->s_flags & SB_RDONLY))
2193                 sfs->os_state |= OS_STATFS_READONLY;
2194
2195         sfs->os_state |= osd->od_nonrotational ? OS_STATFS_NONROT : 0;
2196
2197         if (ldiskfs_has_feature_extents(sb))
2198                 sfs->os_maxbytes = sb->s_maxbytes;
2199         else
2200                 sfs->os_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes;
2201
2202         /*
2203          * Reserve some space so to avoid fragmenting the filesystem too much.
2204          * Fragmentation not only impacts performance, but can also increase
2205          * metadata overhead significantly, causing grant calculation to be
2206          * wrong.
2207          *
2208          * Reserve 0.78% of total space, at least 8MB for small filesystems.
2209          */
2210         BUILD_BUG_ON(OSD_STATFS_RESERVED <= LDISKFS_MAX_BLOCK_SIZE);
2211         reserved = OSD_STATFS_RESERVED >> sb->s_blocksize_bits;
2212         if (likely(sfs->os_blocks >= reserved << OSD_STATFS_RESERVED_SHIFT))
2213                 reserved = sfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT;
2214
2215         sfs->os_blocks -= reserved;
2216         sfs->os_bfree  -= min(reserved, sfs->os_bfree);
2217         sfs->os_bavail -= min(reserved, sfs->os_bavail);
2218
2219 out:
2220         if (unlikely(env == NULL))
2221                 OBD_FREE_PTR(ksfs);
2222         return result;
2223 }
2224
/**
 * Estimate of the space needed per file creation: the directory record of
 * a 20 character name, 20 being the length of 2^64 - 1 written in decimal,
 * which is assumed to be the longest name used.
 * The original estimate puts this at 28 bytes per object, i.e. 28MB for
 * 1M objects ... not so bad.
 */
#ifndef __LDISKFS_DIR_REC_LEN
#define PER_OBJ_USAGE LDISKFS_DIR_REC_LEN(20)
#else
#define PER_OBJ_USAGE __LDISKFS_DIR_REC_LEN(20)
#endif
2235
2236 /*
2237  * Concurrency: doesn't access mutable data.
2238  */
2239 static void osd_conf_get(const struct lu_env *env,
2240                          const struct dt_device *dev,
2241                          struct dt_device_param *param)
2242 {
2243         struct osd_device *d = osd_dt_dev(dev);
2244         struct super_block *sb = osd_sb(d);
2245         struct blk_integrity *bi = bdev_get_integrity(sb->s_bdev);
2246         const char *name;
2247         int ea_overhead;
2248
2249         /*
2250          * XXX should be taken from not-yet-existing fs abstraction layer.
2251          */
2252         param->ddp_max_name_len = LDISKFS_NAME_LEN;
2253         param->ddp_max_nlink    = LDISKFS_LINK_MAX;
2254         param->ddp_symlink_max  = sb->s_blocksize;
2255         param->ddp_mount_type   = LDD_MT_LDISKFS;
2256         if (ldiskfs_has_feature_extents(sb))
2257                 param->ddp_maxbytes = sb->s_maxbytes;
2258         else
2259                 param->ddp_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes;
2260         /*
2261          * inode are statically allocated, so per-inode space consumption
2262          * is the space consumed by the directory entry
2263          */
2264         param->ddp_inodespace     = PER_OBJ_USAGE;
2265         /*
2266          * EXT_INIT_MAX_LEN is the theoretical maximum extent size (32k blocks
2267          * is 128MB) which is unlikely to be hit in real life. Report a smaller
2268          * maximum length to not under-count the actual number of extents
2269          * needed for writing a file if there are sub-optimal block allocations.
2270          */
2271         param->ddp_max_extent_blks = EXT_INIT_MAX_LEN >> 1;
2272         /* worst-case extent insertion metadata overhead */
2273         param->ddp_extent_tax = 6 * LDISKFS_BLOCK_SIZE(sb);
2274         param->ddp_mntopts = 0;
2275         if (test_opt(sb, XATTR_USER))
2276                 param->ddp_mntopts |= MNTOPT_USERXATTR;
2277         if (test_opt(sb, POSIX_ACL))
2278                 param->ddp_mntopts |= MNTOPT_ACL;
2279
2280         /*
2281          * LOD might calculate the max stripe count based on max_ea_size,
2282          * so we need take account in the overhead as well,
2283          * xattr_header + magic + xattr_entry_head
2284          */
2285         ea_overhead = sizeof(struct ldiskfs_xattr_header) + sizeof(__u32) +
2286                       LDISKFS_XATTR_LEN(XATTR_NAME_MAX_LEN);
2287
2288 #if defined(LDISKFS_FEATURE_INCOMPAT_EA_INODE)
2289         if (ldiskfs_has_feature_ea_inode(sb))
2290                 param->ddp_max_ea_size = LDISKFS_XATTR_MAX_LARGE_EA_SIZE -
2291                                                                 ea_overhead;
2292         else
2293 #endif
2294                 param->ddp_max_ea_size = sb->s_blocksize - ea_overhead;
2295
2296         if (param->ddp_max_ea_size > OBD_MAX_EA_SIZE)
2297                 param->ddp_max_ea_size = OBD_MAX_EA_SIZE;
2298
2299         /*
2300          * Preferred RPC size for efficient disk IO.  4MB shows good
2301          * all-around performance for ldiskfs, but use bigalloc chunk size
2302          * by default if larger.
2303          */
2304 #if defined(LDISKFS_CLUSTER_SIZE)
2305         if (LDISKFS_CLUSTER_SIZE(sb) > DT_DEF_BRW_SIZE)
2306                 param->ddp_brw_size = LDISKFS_CLUSTER_SIZE(sb);
2307         else
2308 #endif
2309                 param->ddp_brw_size = DT_DEF_BRW_SIZE;
2310
2311         param->ddp_t10_cksum_type = 0;
2312         if (bi) {
2313                 unsigned short interval = blk_integrity_interval(bi);
2314                 name = blk_integrity_name(bi);
2315                 /*
2316                  * Expected values:
2317                  * T10-DIF-TYPE1-CRC
2318                  * T10-DIF-TYPE3-CRC
2319                  * T10-DIF-TYPE1-IP
2320                  * T10-DIF-TYPE3-IP
2321                  */
2322                 if (strncmp(name, "T10-DIF-TYPE",
2323                             sizeof("T10-DIF-TYPE") - 1) == 0) {
2324                         /* also skip "1/3-" at end */
2325                         const int type_off = sizeof("T10-DIF-TYPE.");
2326                         char type_number = name[type_off - 2];
2327
2328                         if (interval != 512 && interval != 4096) {
2329                                 CERROR("%s: unsupported T10PI sector size %u\n",
2330                                        d->od_svname, interval);
2331                         } else if (type_number != '1' && type_number != '3') {
2332                                 CERROR("%s: unsupported T10PI type %s\n",
2333                                        d->od_svname, name);
2334                         } else if (strcmp(name + type_off, "CRC") == 0) {
2335                                 d->od_t10_type = type_number == '1' ?
2336                                         OSD_T10_TYPE1_CRC : OSD_T10_TYPE3_CRC;
2337                                 param->ddp_t10_cksum_type = interval == 512 ?
2338                                         OBD_CKSUM_T10CRC512 :
2339                                         OBD_CKSUM_T10CRC4K;
2340                         } else if (strcmp(name + type_off, "IP") == 0) {
2341                                 d->od_t10_type = type_number == '1' ?
2342                                         OSD_T10_TYPE1_IP : OSD_T10_TYPE3_IP;
2343                                 param->ddp_t10_cksum_type = interval == 512 ?
2344                                         OBD_CKSUM_T10IP512 :
2345                                         OBD_CKSUM_T10IP4K;
2346                         } else {
2347                                 CERROR("%s: unsupported checksum type of "
2348                                        "T10PI type '%s'",
2349                                        d->od_svname, name);
2350                         }
2351
2352                 } else {
2353                         CERROR("%s: unsupported T10PI type '%s'",
2354                                d->od_svname, name);
2355                 }
2356         }
2357 }
2358
/* Return the superblock backing dt device \a d. */
static struct super_block *osd_mnt_sb_get(const struct dt_device *d)
{
        struct osd_device *osd = osd_dt_dev(d);

        return osd_sb(osd);
}
2363
2364 /*
2365  * Concurrency: shouldn't matter.
2366  */
2367 static int osd_sync(const struct lu_env *env, struct dt_device *d)
2368 {
2369         int rc;
2370         struct super_block *s = osd_sb(osd_dt_dev(d));
2371         ENTRY;
2372
2373         down_read(&s->s_umount);
2374         rc = s->s_op->sync_fs(s, 1);
2375         up_read(&s->s_umount);
2376
2377         CDEBUG(D_CACHE, "%s: synced OSD: rc = %d\n", osd_dt_dev(d)->od_svname,
2378                rc);
2379
2380         return rc;
2381 }
2382
2383 /**
2384  * Start commit for OSD device.
2385  *
2386  * An implementation of dt_commit_async method for OSD device.
2387  * Asychronously starts underlayng fs sync and thereby a transaction
2388  * commit.
2389  *
2390  * \param env environment
2391  * \param d dt device
2392  *
2393  * \see dt_device_operations
2394  */
2395 static int osd_commit_async(const struct lu_env *env,
2396                             struct dt_device *d)
2397 {
2398         struct super_block *s = osd_sb(osd_dt_dev(d));
2399         int rc;
2400
2401         ENTRY;
2402
2403         CDEBUG(D_HA, "%s: async commit OSD\n", osd_dt_dev(d)->od_svname);
2404         down_read(&s->s_umount);
2405         rc = s->s_op->sync_fs(s, 0);
2406         up_read(&s->s_umount);
2407
2408         RETURN(rc);
2409 }
2410
/* Optional hook; while it is NULL osd_security_file_alloc() does nothing. */
static int (*priv_security_file_alloc)(struct file *file);

/**
 * Forward \a file to the priv_security_file_alloc hook when one is set.
 *
 * \retval 0		no hook is installed
 * \retval other	whatever the hook returned
 */
int osd_security_file_alloc(struct file *file)
{
	return priv_security_file_alloc ? priv_security_file_alloc(file) : 0;
}
2419
2420 /*
2421  * Concurrency: shouldn't matter.
2422  */
2423 static int osd_ro(const struct lu_env *env, struct dt_device *d)
2424 {
2425         struct super_block *sb = osd_sb(osd_dt_dev(d));
2426         struct block_device *dev = sb->s_bdev;
2427         int rc = -EOPNOTSUPP;
2428
2429         ENTRY;
2430
2431         CERROR("%s: %lx CANNOT BE SET READONLY: rc = %d\n",
2432                osd_dt_dev(d)->od_svname, (long)dev, rc);
2433
2434         RETURN(rc);
2435 }
2436
2437 /**
2438  * Note: we do not count into QUOTA here.
2439  * If we mount with --data_journal we may need more.
2440  */
2441 const int osd_dto_credits_noquota[DTO_NR] = {
2442         /**
2443          * Insert.
2444          * INDEX_EXTRA_TRANS_BLOCKS(8) +
2445          * SINGLEDATA_TRANS_BLOCKS(8)
2446          * XXX Note: maybe iam need more, since iam have more level than
2447          *           EXT3 htree.
2448          */
2449         [DTO_INDEX_INSERT]  = 16,
2450         /**
2451          * Delete
2452          * just modify a single entry, probably merge few within a block
2453          */
2454         [DTO_INDEX_DELETE]  = 1,
2455         /**
2456          * Used for OI scrub
2457          */
2458         [DTO_INDEX_UPDATE]  = 16,
2459         /**
2460          * 4(inode, inode bits, groups, GDT)
2461          *   notice: OI updates are counted separately with DTO_INDEX_INSERT
2462          */
2463         [DTO_OBJECT_CREATE] = 4,
2464         /**
2465          * 4(inode, inode bits, groups, GDT)
2466          *   notice: OI updates are counted separately with DTO_INDEX_DELETE
2467          */
2468         [DTO_OBJECT_DELETE] = 4,
2469         /**
2470          * Attr set credits (inode)
2471          */
2472         [DTO_ATTR_SET_BASE] = 1,
2473         /**
2474          * Xattr set. The same as xattr of EXT3.
2475          * DATA_TRANS_BLOCKS(14)
2476          * XXX Note: in original MDS implmentation INDEX_EXTRA_TRANS_BLOCKS
2477          * are also counted in. Do not know why?
2478          */
2479         [DTO_XATTR_SET]     = 14,
2480         /**
2481          * credits for inode change during write.
2482          */
2483         [DTO_WRITE_BASE]    = 3,
2484         /**
2485          * credits for single block write.
2486          */
2487         [DTO_WRITE_BLOCK]   = 14,
2488         /**
2489          * Attr set credits for chown.
2490          * This is extra credits for setattr, and it is null without quota
2491          */
2492         [DTO_ATTR_SET_CHOWN] = 0
2493 };
2494
2495 static const struct dt_device_operations osd_dt_ops = {
2496         .dt_root_get       = osd_root_get,
2497         .dt_statfs         = osd_statfs,
2498         .dt_trans_create   = osd_trans_create,
2499         .dt_trans_start    = osd_trans_start,
2500         .dt_trans_stop     = osd_trans_stop,
2501         .dt_trans_cb_add   = osd_trans_cb_add,
2502         .dt_conf_get       = osd_conf_get,
2503         .dt_mnt_sb_get     = osd_mnt_sb_get,
2504         .dt_sync           = osd_sync,
2505         .dt_ro             = osd_ro,
2506         .dt_commit_async   = osd_commit_async,
2507 };
2508
2509 static void osd_read_lock(const struct lu_env *env, struct dt_object *dt,
2510                           unsigned int role)
2511 {
2512         struct osd_object *obj = osd_dt_obj(dt);
2513         struct osd_thread_info *oti = osd_oti_get(env);
2514
2515         LINVRNT(osd_invariant(obj));
2516
2517         LASSERT(obj->oo_owner != env);
2518         down_read_nested(&obj->oo_sem, role);
2519
2520         LASSERT(obj->oo_owner == NULL);
2521         oti->oti_r_locks++;
2522 }
2523
2524 static void osd_write_lock(const struct lu_env *env, struct dt_object *dt,
2525                            unsigned int role)
2526 {
2527         struct osd_object *obj = osd_dt_obj(dt);
2528         struct osd_thread_info *oti = osd_oti_get(env);
2529
2530         LINVRNT(osd_invariant(obj));
2531
2532         LASSERT(obj->oo_owner != env);
2533         down_write_nested(&obj->oo_sem, role);
2534
2535         LASSERT(obj->oo_owner == NULL);
2536         obj->oo_owner = env;
2537         oti->oti_w_locks++;
2538 }
2539
2540 static void osd_read_unlock(const struct lu_env *env, struct dt_object *dt)
2541 {
2542         struct osd_object *obj = osd_dt_obj(dt);
2543         struct osd_thread_info *oti = osd_oti_get(env);
2544
2545         LINVRNT(osd_invariant(obj));
2546
2547         LASSERT(oti->oti_r_locks > 0);
2548         oti->oti_r_locks--;
2549         up_read(&obj->oo_sem);
2550 }
2551
2552 static void osd_write_unlock(const struct lu_env *env, struct dt_object *dt)
2553 {
2554         struct osd_object *obj = osd_dt_obj(dt);
2555         struct osd_thread_info *oti = osd_oti_get(env);
2556
2557         LINVRNT(osd_invariant(obj));
2558
2559         LASSERT(obj->oo_owner == env);
2560         LASSERT(oti->oti_w_locks > 0);
2561         oti->oti_w_locks--;
2562         obj->oo_owner = NULL;
2563         up_write(&obj->oo_sem);
2564 }
2565
2566 static int osd_write_locked(const struct lu_env *env, struct dt_object *dt)
2567 {
2568         struct osd_object *obj = osd_dt_obj(dt);
2569
2570         LINVRNT(osd_invariant(obj));
2571
2572         return obj->oo_owner == env;
2573 }
2574
2575 static void osd_inode_getattr(const struct lu_env *env,
2576                               struct inode *inode, struct lu_attr *attr)
2577 {
2578         attr->la_valid  |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE |
2579                            LA_SIZE | LA_BLOCKS | LA_UID | LA_GID |
2580                            LA_PROJID | LA_FLAGS | LA_NLINK | LA_RDEV |
2581                            LA_BLKSIZE | LA_TYPE | LA_BTIME;
2582
2583         attr->la_atime = inode->i_atime.tv_sec;
2584         attr->la_mtime = inode->i_mtime.tv_sec;
2585         attr->la_ctime = inode->i_ctime.tv_sec;
2586         attr->la_btime = LDISKFS_I(inode)->i_crtime.tv_sec;
2587         attr->la_mode    = inode->i_mode;
2588         attr->la_size    = i_size_read(inode);
2589         attr->la_blocks  = inode->i_blocks;
2590         attr->la_uid     = i_uid_read(inode);
2591         attr->la_gid     = i_gid_read(inode);
2592         attr->la_projid  = i_projid_read(inode);
2593         attr->la_flags   = ll_inode_to_ext_flags(inode->i_flags);
2594         attr->la_nlink   = inode->i_nlink;
2595         attr->la_rdev    = inode->i_rdev;
2596         attr->la_blksize = 1 << inode->i_blkbits;
2597         attr->la_blkbits = inode->i_blkbits;
2598         /*
2599          * Ext4 did not transfer inherit flags from raw inode
2600          * to inode flags, and ext4 internally test raw inode
2601          * @i_flags directly. Instead of patching ext4, we do it here.
2602          */
2603         if (LDISKFS_I(inode)->i_flags & LUSTRE_PROJINHERIT_FL)
2604                 attr->la_flags |= LUSTRE_PROJINHERIT_FL;
2605 }
2606
2607 static int osd_dirent_count(const struct lu_env *env, struct dt_object *dt,
2608                             u64 *count)
2609 {
2610         struct osd_object *obj = osd_dt_obj(dt);
2611         const struct dt_it_ops *iops;
2612         struct dt_it *it;
2613         int rc;
2614
2615         ENTRY;
2616
2617         LASSERT(S_ISDIR(obj->oo_inode->i_mode));
2618         LASSERT(fid_is_namespace_visible(lu_object_fid(&obj->oo_dt.do_lu)));
2619
2620         if (obj->oo_dirent_count != LU_DIRENT_COUNT_UNSET) {
2621                 *count = obj->oo_dirent_count;
2622                 RETURN(0);
2623         }
2624
2625         /* directory not initialized yet */
2626         if (!dt->do_index_ops) {
2627                 *count = 0;
2628                 RETURN(0);
2629         }
2630
2631         iops = &dt->do_index_ops->dio_it;
2632         it = iops->init(env, dt, LUDA_64BITHASH);
2633         if (IS_ERR(it))
2634                 RETURN(PTR_ERR(it));
2635
2636         rc = iops->load(env, it, 0);
2637         if (rc < 0) {
2638                 if (rc == -ENODATA) {
2639                         rc = 0;
2640                         *count = 0;
2641                 }
2642                 GOTO(out, rc);
2643         }
2644         if (rc > 0)
2645                 rc = iops->next(env, it);
2646
2647         for (*count = 0; rc == 0 || rc == -ESTALE; rc = iops->next(env, it)) {
2648                 if (rc == -ESTALE)
2649                         continue;
2650
2651                 if (iops->key_size(env, it) == 0)
2652                         continue;
2653
2654                 (*count)++;
2655         }
2656         if (rc == 1) {
2657                 obj->oo_dirent_count = *count;
2658                 rc = 0;
2659         }
2660 out:
2661         iops->put(env, it);
2662         iops->fini(env, it);
2663
2664         RETURN(rc);
2665 }
2666
2667 static int osd_attr_get(const struct lu_env *env, struct dt_object *dt,
2668                         struct lu_attr *attr)
2669 {
2670         struct osd_object *obj = osd_dt_obj(dt);
2671         int rc = 0;
2672
2673         if (unlikely(!dt_object_exists(dt)))
2674                 return -ENOENT;
2675         if (unlikely(obj->oo_destroyed))
2676                 return -ENOENT;
2677
2678         LASSERT(!dt_object_remote(dt));
2679         LINVRNT(osd_invariant(obj));
2680
2681         spin_lock(&obj->oo_guard);
2682         osd_inode_getattr(env, obj->oo_inode, attr);
2683         if (obj->oo_lma_flags & LUSTRE_ORPHAN_FL) {
2684                 attr->la_valid |= LA_FLAGS;
2685                 attr->la_flags |= LUSTRE_ORPHAN_FL;
2686         }
2687         if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL) {
2688                 attr->la_valid |= LA_FLAGS;
2689                 attr->la_flags |= LUSTRE_ENCRYPT_FL;
2690         }
2691         spin_unlock(&obj->oo_guard);
2692
2693         if (S_ISDIR(obj->oo_inode->i_mode) &&
2694             fid_is_namespace_visible(lu_object_fid(&dt->do_lu)))
2695                 rc = osd_dirent_count(env, dt, &attr->la_dirent_count);
2696
2697         return rc;
2698 }
2699
2700 static int osd_declare_attr_qid(const struct lu_env *env,
2701                                 struct osd_object *obj,
2702                                 struct osd_thandle *oh, long long bspace,
2703                                 qid_t old_id, qid_t new_id, bool enforce,
2704                                 unsigned int type, bool ignore_edquot)
2705 {
2706         int rc;
2707         struct osd_thread_info *info = osd_oti_get(env);
2708         struct lquota_id_info  *qi = &info->oti_qi;
2709
2710         qi->lqi_type = type;
2711         /* inode accounting */
2712         qi->lqi_is_blk = false;
2713
2714         /* one more inode for the new id ... */
2715         qi->lqi_id.qid_uid = new_id;
2716         qi->lqi_space      = 1;
2717         /* Reserve credits for the new id */
2718         rc = osd_declare_qid(env, oh, qi, NULL, enforce, NULL);
2719         if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS))
2720                 rc = 0;
2721         if (rc)
2722                 RETURN(rc);
2723
2724         /* and one less inode for the current id */
2725         qi->lqi_id.qid_uid = old_id;
2726         qi->lqi_space = -1;
2727         rc = osd_declare_qid(env, oh, qi, obj, enforce, NULL);
2728         if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS))
2729                 rc = 0;
2730         if (rc)
2731                 RETURN(rc);
2732
2733         /* block accounting */
2734         qi->lqi_is_blk = true;
2735
2736         /* more blocks for the new id ... */
2737         qi->lqi_id.qid_uid = new_id;
2738         qi->lqi_space      = bspace;
2739         /*
2740          * Credits for the new uid has been reserved, re-use "obj"
2741          * to save credit reservation.
2742          */
2743         rc = osd_declare_qid(env, oh, qi, obj, enforce, NULL);
2744         if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS))
2745                 rc = 0;
2746         if (rc)
2747                 RETURN(rc);
2748
2749         /* and finally less blocks for the current uid */
2750         qi->lqi_id.qid_uid = old_id;
2751         qi->lqi_space      = -bspace;
2752         rc = osd_declare_qid(env, oh, qi, obj, enforce, NULL);
2753         if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS))
2754                 rc = 0;
2755
2756         RETURN(rc);
2757 }
2758
2759 static int osd_declare_attr_set(const struct lu_env *env,
2760                                 struct dt_object *dt,
2761                                 const struct lu_attr *attr,
2762                                 struct thandle *handle)
2763 {
2764         struct osd_thandle *oh;
2765         struct osd_object *obj;
2766         qid_t uid;
2767         qid_t gid;
2768         long long bspace;
2769         int rc = 0;
2770         bool enforce;
2771
2772         ENTRY;
2773
2774         LASSERT(dt != NULL);
2775         LASSERT(handle != NULL);
2776
2777         obj = osd_dt_obj(dt);
2778         LASSERT(osd_invariant(obj));
2779
2780         oh = container_of(handle, struct osd_thandle, ot_super);
2781         LASSERT(oh->ot_handle == NULL);
2782
2783         osd_trans_declare_op(env, oh, OSD_OT_ATTR_SET,
2784                              osd_dto_credits_noquota[DTO_ATTR_SET_BASE]);
2785
2786         osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET,
2787                              osd_dto_credits_noquota[DTO_XATTR_SET]);
2788
2789         if (attr == NULL || obj->oo_inode == NULL)
2790                 RETURN(rc);
2791
2792         bspace   = obj->oo_inode->i_blocks << 9;
2793         bspace   = toqb(bspace);
2794
2795         /*
2796          * Changing ownership is always preformed by super user, it should not
2797          * fail with EDQUOT unless required explicitly.
2798          *
2799          * We still need to call the osd_declare_qid() to calculate the journal
2800          * credits for updating quota accounting files and to trigger quota
2801          * space adjustment once the operation is completed.
2802          */
2803         if (attr->la_valid & LA_UID || attr->la_valid & LA_GID) {
2804                 bool ignore_edquot = !(attr->la_flags & LUSTRE_SET_SYNC_FL);
2805
2806                 if (!ignore_edquot)
2807                         CDEBUG(D_QUOTA, "%s: enforce quota on UID %u, GID %u"
2808                                "(the quota space is %lld)\n",
2809                                obj->oo_inode->i_sb->s_id, attr->la_uid,
2810                                attr->la_gid, bspace);
2811
2812                 /* USERQUOTA */
2813                 uid = i_uid_read(obj->oo_inode);
2814                 enforce = (attr->la_valid & LA_UID) && (attr->la_uid != uid);
2815                 rc = osd_declare_attr_qid(env, obj, oh, bspace, uid,
2816                                           attr->la_uid, enforce, USRQUOTA,
2817                                           true);
2818                 if (rc)
2819                         RETURN(rc);
2820
2821                 gid = i_gid_read(obj->oo_inode);
2822                 CDEBUG(D_QUOTA, "declare uid %d -> %d gid %d -> %d\n", uid,
2823                        attr->la_uid, gid, attr->la_gid);
2824                 enforce = (attr->la_valid & LA_GID) && (attr->la_gid != gid);
2825                 rc = osd_declare_attr_qid(env, obj, oh, bspace, gid,
2826                                           attr->la_gid, enforce, GRPQUOTA,
2827                                           ignore_edquot);
2828                 if (rc)
2829                         RETURN(rc);
2830
2831         }
2832 #ifdef HAVE_PROJECT_QUOTA
2833         if (attr->la_valid & LA_PROJID) {
2834                 __u32 projid = i_projid_read(obj->oo_inode);
2835
2836                 enforce = (attr->la_valid & LA_PROJID) &&
2837                                         (attr->la_projid != projid);
2838                 rc = osd_declare_attr_qid(env, obj, oh, bspace,
2839                                           (qid_t)projid, (qid_t)attr->la_projid,
2840                                           enforce, PRJQUOTA, true);
2841                 if (rc)
2842                         RETURN(rc);
2843         }
2844 #endif
2845         RETURN(rc);
2846 }
2847
2848 static int osd_inode_setattr(const struct lu_env *env,
2849                              struct inode *inode, const struct lu_attr *attr)
2850 {
2851         __u64 bits = attr->la_valid;
2852
2853         /* Only allow set size for regular file */
2854         if (!S_ISREG(inode->i_mode))
2855                 bits &= ~(LA_SIZE | LA_BLOCKS);
2856
2857         if (bits == 0)
2858                 return 0;
2859
2860         if (bits & LA_ATIME)
2861                 inode->i_atime = osd_inode_time(inode, attr->la_atime);
2862         if (bits & LA_CTIME)
2863                 inode->i_ctime = osd_inode_time(inode, attr->la_ctime);
2864         if (bits & LA_MTIME)
2865                 inode->i_mtime = osd_inode_time(inode, attr->la_mtime);
2866         if (bits & LA_SIZE) {
2867                 spin_lock(&inode->i_lock);
2868                 LDISKFS_I(inode)->i_disksize = attr->la_size;
2869                 i_size_write(inode, attr->la_size);
2870                 spin_unlock(&inode->i_lock);
2871         }
2872
2873         /*
2874          * OSD should not change "i_blocks" which is used by quota.
2875          * "i_blocks" should be changed by ldiskfs only.
2876          */
2877         if (bits & LA_MODE)
2878                 inode->i_mode = (inode->i_mode & S_IFMT) |
2879                                 (attr->la_mode & ~S_IFMT);
2880         if (bits & LA_UID)
2881                 i_uid_write(inode, attr->la_uid);
2882         if (bits & LA_GID)
2883                 i_gid_write(inode, attr->la_gid);
2884         if (bits & LA_PROJID)
2885                 i_projid_write(inode, attr->la_projid);
2886         if (bits & LA_NLINK)
2887                 set_nlink(inode, attr->la_nlink);
2888         if (bits & LA_RDEV)
2889                 inode->i_rdev = attr->la_rdev;
2890
2891         if (bits & LA_FLAGS) {
2892                 /* always keep S_NOCMTIME */
2893                 inode->i_flags = ll_ext_to_inode_flags(attr->la_flags) |
2894                                  S_NOCMTIME;
2895 #if defined(S_ENCRYPTED)
2896                 /* Always remove S_ENCRYPTED, because ldiskfs must not be
2897                  * aware of encryption status. It is just stored into LMA
2898                  * so that it can be forwared to client side.
2899                  */
2900                 inode->i_flags &= ~S_ENCRYPTED;
2901 #endif
2902                 /*
2903                  * Ext4 did not transfer inherit flags from
2904                  * @inode->i_flags to raw inode i_flags when writing
2905                  * flags, we do it explictly here.
2906                  */
2907                 if (attr->la_flags & LUSTRE_PROJINHERIT_FL)
2908                         LDISKFS_I(inode)->i_flags |= LUSTRE_PROJINHERIT_FL;
2909                 else
2910                         LDISKFS_I(inode)->i_flags &= ~LUSTRE_PROJINHERIT_FL;
2911         }
2912         return 0;
2913 }
2914
#ifdef HAVE_PROJECT_QUOTA
/**
 * Transfer the quota usage of \a inode to project \a projid.
 *
 * \param inode		inode whose project changes
 * \param projid	target project id
 * \param handle	running transaction, used when the inode has to be
 *			marked dirty to make room for i_projid
 *
 * \retval 0		success, or nothing to do (same project, or no dquot
 *			could be obtained for the target project)
 * \retval -EOPNOTSUPP	the project feature is off and \a projid is not the
 *			default one, or the on-disk inode is too small
 * \retval -EOVERFLOW	i_projid does not fit into the on-disk inode
 * \retval negative	error from ldiskfs_get_inode_loc() or
 *			__dquot_transfer()
 */
static int osd_transfer_project(struct inode *inode, __u32 projid,
				struct thandle *handle)
{
	struct super_block *sb = inode->i_sb;
	struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
	int err;
	kprojid_t kprojid;
	struct ldiskfs_iloc iloc;
	struct ldiskfs_inode *raw_inode;
	struct dquot *transfer_to[LDISKFS_MAXQUOTAS] = { };

	if (!ldiskfs_has_feature_project(sb)) {
		LASSERT(__kprojid_val(LDISKFS_I(inode)->i_projid)
			== LDISKFS_DEF_PROJID);
		if (projid != LDISKFS_DEF_PROJID)
			return -EOPNOTSUPP;
		else
			return 0;
	}

	if (LDISKFS_INODE_SIZE(sb) <= LDISKFS_GOOD_OLD_INODE_SIZE)
		return -EOPNOTSUPP;

	kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
	if (projid_eq(kprojid, LDISKFS_I(inode)->i_projid))
		return 0;

	err = ldiskfs_get_inode_loc(inode, &iloc);
	if (err)
		return err;

	raw_inode = ldiskfs_raw_inode(&iloc);
	if (!LDISKFS_FITS_IN_INODE(raw_inode, ei, i_projid)) {
		struct osd_thandle *oh = container_of(handle,
						      struct osd_thandle,
						      ot_super);
		/**
		 * try to expand inode size automatically.
		 */
		ldiskfs_mark_inode_dirty(oh->ot_handle, inode);
		if (!LDISKFS_FITS_IN_INODE(raw_inode, ei, i_projid)) {
			brelse(iloc.bh);
			return -EOVERFLOW;
		}
	}
	brelse(iloc.bh);

	dquot_initialize(inode);
	transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
	/*
	 * Depending on the kernel, dqget() reports failure either as NULL
	 * or as an ERR_PTR(); neither may be handed to __dquot_transfer()
	 * or dqput(). As before, a missing dquot is not treated as an error.
	 */
	if (IS_ERR_OR_NULL(transfer_to[PRJQUOTA]))
		return 0;

	err = __dquot_transfer(inode, transfer_to);
	dqput(transfer_to[PRJQUOTA]);

	return err;
}
#endif
2976
2977 static int osd_quota_transfer(struct inode *inode, const struct lu_attr *attr,
2978                               struct thandle *handle)
2979 {
2980         int rc;
2981
2982         if ((attr->la_valid & LA_UID && attr->la_uid != i_uid_read(inode)) ||
2983             (attr->la_valid & LA_GID && attr->la_gid != i_gid_read(inode))) {
2984                 struct iattr iattr;
2985
2986                 CDEBUG(D_QUOTA,
2987                        "executing dquot_transfer inode %ld uid %d -> %d gid %d -> %d\n",
2988                        inode->i_ino, i_uid_read(inode), attr->la_uid,
2989                        i_gid_read(inode), attr->la_gid);
2990
2991                 dquot_initialize(inode);
2992                 iattr.ia_valid = 0;
2993                 if (attr->la_valid & LA_UID)
2994                         iattr.ia_valid |= ATTR_UID;
2995                 if (attr->la_valid & LA_GID)
2996                         iattr.ia_valid |= ATTR_GID;
2997                 iattr.ia_uid = make_kuid(&init_user_ns, attr->la_uid);
2998                 iattr.ia_gid = make_kgid(&init_user_ns, attr->la_gid);
2999
3000                 rc = dquot_transfer(inode, &iattr);
3001                 if (rc) {
3002                         CERROR("%s: quota transfer failed: rc = %d. Is quota "
3003                                "enforcement enabled on the ldiskfs "
3004                                "filesystem?\n", inode->i_sb->s_id, rc);
3005                         return rc;
3006                 }
3007         }
3008
3009         /* Handle project id transfer here properly */
3010         if (attr->la_valid & LA_PROJID &&
3011             attr->la_projid != i_projid_read(inode)) {
3012 #ifdef HAVE_PROJECT_QUOTA
3013                 rc = osd_transfer_project(inode, attr->la_projid, handle);
3014 #else
3015                 rc = -ENOTSUPP;
3016 #endif
3017                 if (rc) {
3018                         CERROR("%s: quota transfer failed: rc = %d. Is project "
3019                                "enforcement enabled on the ldiskfs "
3020                                "filesystem?\n", inode->i_sb->s_id, rc);
3021                         return rc;
3022                 }
3023         }
3024         return 0;
3025 }
3026
3027 static int osd_attr_set(const struct lu_env *env,
3028                         struct dt_object *dt,
3029                         const struct lu_attr *attr,
3030                         struct thandle *handle)
3031 {
3032         struct osd_object *obj = osd_dt_obj(dt);
3033         struct inode *inode;
3034         int rc;
3035
3036         if (!dt_object_exists(dt))
3037                 return -ENOENT;
3038
3039         LASSERT(handle != NULL);
3040         LASSERT(!dt_object_remote(dt));
3041         LASSERT(osd_invariant(obj));
3042
3043         osd_trans_exec_op(env, handle, OSD_OT_ATTR_SET);
3044
3045         if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FID_MAPPING) &&
3046             !osd_obj2dev(obj)->od_is_ost) {
3047                 struct osd_thread_info *oti = osd_oti_get(env);
3048                 const struct lu_fid *fid0 = lu_object_fid(&dt->do_lu);
3049                 struct lu_fid *fid1 = &oti->oti_fid;
3050                 struct osd_inode_id *id = &oti->oti_id;
3051                 struct iam_path_descr *ipd;
3052                 struct iam_container *bag;
3053                 struct osd_thandle *oh;
3054                 int rc;
3055
3056                 fid_cpu_to_be(fid1, fid0);
3057                 memset(id, 1, sizeof(*id));
3058                 bag = &osd_fid2oi(osd_dev(dt->do_lu.lo_dev),
3059                                   fid0)->oi_dir.od_container;
3060                 ipd = osd_idx_ipd_get(env, bag);
3061                 if (unlikely(ipd == NULL))
3062                         RETURN(-ENOMEM);
3063
3064                 oh = container_of(handle, struct osd_thandle, ot_super);
3065                 rc = iam_update(oh->ot_handle, bag,
3066                                 (const struct iam_key *)fid1,
3067                                 (const struct iam_rec *)id, ipd);
3068                 osd_ipd_put(env, bag, ipd);
3069                 return(rc > 0 ? 0 : rc);
3070         }
3071
3072         inode = obj->oo_inode;
3073
3074         rc = osd_quota_transfer(inode, attr, handle);
3075         if (rc)
3076                 return rc;
3077
3078         spin_lock(&obj->oo_guard);
3079         rc = osd_inode_setattr(env, inode, attr);
3080         spin_unlock(&obj->oo_guard);
3081         if (rc != 0)
3082                 GOTO(out, rc);
3083
3084         osd_dirty_inode(inode, I_DIRTY_DATASYNC);
3085
3086         osd_trans_exec_check(env, handle, OSD_OT_ATTR_SET);
3087
3088         if (!(attr->la_valid & LA_FLAGS))
3089                 GOTO(out, rc);
3090
3091         /* Let's check if there are extra flags need to be set into LMA */
3092         if (attr->la_flags & LUSTRE_LMA_FL_MASKS) {
3093                 struct osd_thread_info *info = osd_oti_get(env);
3094                 struct lustre_mdt_attrs *lma = &info->oti_ost_attrs.loa_lma;
3095
3096                 LASSERT(!obj->oo_pfid_in_lma);
3097
3098                 rc = osd_get_lma(info, inode, &info->oti_obj_dentry,
3099                                  &info->oti_ost_attrs);
3100                 if (rc)
3101                         GOTO(out, rc);
3102
3103                 lma->lma_incompat |=
3104                         lustre_to_lma_flags(attr->la_flags);
3105                 lustre_lma_swab(lma);
3106
3107                 osd_trans_exec_op(env, handle, OSD_OT_XATTR_SET);
3108
3109                 rc = __osd_xattr_set(info, inode, XATTR_NAME_LMA,
3110                                      lma, sizeof(*lma), XATTR_REPLACE);
3111                 if (rc != 0) {
3112                         struct osd_device *osd = osd_obj2dev(obj);
3113
3114                         CWARN("%s: set "DFID" lma flags %u failed: rc = %d\n",
3115                               osd_name(osd), PFID(lu_object_fid(&dt->do_lu)),
3116                               lma->lma_incompat, rc);
3117                 } else {
3118                         obj->oo_lma_flags =
3119                                 attr->la_flags & LUSTRE_LMA_FL_MASKS;
3120                 }
3121                 osd_trans_exec_check(env, handle, OSD_OT_XATTR_SET);
3122         }
3123 out:
3124
3125         return rc;
3126 }
3127
3128 static struct dentry *osd_child_dentry_get(const struct lu_env *env,
3129                                            struct osd_object *obj,
3130                                            const char *name, const int namelen)
3131 {
3132         return osd_child_dentry_by_inode(env, obj->oo_inode, name, namelen);
3133 }
3134
3135 static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj,
3136                       umode_t mode, struct dt_allocation_hint *hint,
3137                       struct thandle *th, struct lu_attr *attr)
3138 {
3139         int result;
3140         struct osd_device *osd = osd_obj2dev(obj);
3141         struct osd_thandle *oth;
3142         struct dt_object *parent = NULL;
3143         struct inode *inode;
3144         uid_t owner[2] = {0, 0};
3145
3146         if (attr->la_valid & LA_UID)
3147                 owner[0] = attr->la_uid;
3148         if (attr->la_valid & LA_GID)
3149                 owner[1] = attr->la_gid;
3150
3151         LINVRNT(osd_invariant(obj));
3152         LASSERT(obj->oo_inode == NULL);
3153         LASSERT(obj->oo_hl_head == NULL);
3154
3155         if (S_ISDIR(mode) && ldiskfs_pdo) {
3156                 obj->oo_hl_head =
3157                         ldiskfs_htree_lock_head_alloc(HTREE_HBITS_DEF);
3158                 if (obj->oo_hl_head == NULL)
3159                         return -ENOMEM;
3160         }
3161
3162         oth = container_of(th, struct osd_thandle, ot_super);
3163         LASSERT(oth->ot_handle->h_transaction != NULL);
3164
3165         if (hint != NULL && hint->dah_parent != NULL &&
3166             !dt_object_remote(hint->dah_parent))
3167                 parent = hint->dah_parent;
3168
3169         inode = ldiskfs_create_inode(oth->ot_handle,
3170                                      parent ? osd_dt_obj(parent)->oo_inode :
3171                                               osd_sb(osd)->s_root->d_inode,
3172                                      mode, owner);
3173         if (!IS_ERR(inode)) {
3174                 /* Do not update file c/mtime in ldiskfs. */
3175                 inode->i_flags |= S_NOCMTIME;
3176
3177                 /*
3178                  * For new created object, it must be consistent,
3179                  * and it is unnecessary to scrub against it.
3180                  */
3181                 ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB);
3182
3183                 obj->oo_inode = inode;
3184                 result = 0;
3185         } else {
3186                 if (obj->oo_hl_head != NULL) {
3187                         ldiskfs_htree_lock_head_free(obj->oo_hl_head);
3188                         obj->oo_hl_head = NULL;
3189                 }
3190                 result = PTR_ERR(inode);
3191         }
3192         LINVRNT(osd_invariant(obj));
3193         return result;
3194 }
3195
/* Name length limit used by this file (presumably in bytes - confirm). */
enum { OSD_NAME_LEN = 255 };
3199
3200 static int osd_mkdir(struct osd_thread_info *info, struct osd_object *obj,
3201                      struct lu_attr *attr,
3202                      struct dt_allocation_hint *hint,
3203                      struct dt_object_format *dof,
3204                      struct thandle *th)
3205 {
3206         int result;
3207         struct osd_thandle *oth;
3208         __u32 mode = (attr->la_mode & (S_IFMT | S_IRWXUGO | S_ISVTX | S_ISGID));
3209
3210         LASSERT(S_ISDIR(attr->la_mode));
3211
3212         oth = container_of(th, struct osd_thandle, ot_super);
3213         LASSERT(oth->ot_handle->h_transaction != NULL);
3214         if (fid_is_namespace_visible(lu_object_fid(&obj->oo_dt.do_lu)))
3215                 obj->oo_dirent_count = 0;
3216         result = osd_mkfile(info, obj, mode, hint, th, attr);
3217
3218         return result;
3219 }
3220
3221 static int osd_mk_index(struct osd_thread_info *info, struct osd_object *obj,
3222                         struct lu_attr *attr,
3223                         struct dt_allocation_hint *hint,
3224                         struct dt_object_format *dof,
3225                         struct thandle *th)
3226 {
3227         int result;
3228         struct osd_thandle *oth;
3229         const struct dt_index_features *feat = dof->u.dof_idx.di_feat;
3230
3231         __u32 mode = (attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX));
3232
3233         LASSERT(S_ISREG(attr->la_mode));
3234
3235         oth = container_of(th, struct osd_thandle, ot_super);
3236         LASSERT(oth->ot_handle->h_transaction != NULL);
3237
3238         result = osd_mkfile(info, obj, mode, hint, th, attr);
3239         if (result == 0) {
3240                 LASSERT(obj->oo_inode != NULL);
3241                 if (feat->dif_flags & DT_IND_VARKEY)
3242                         result = iam_lvar_create(obj->oo_inode,
3243                                                  feat->dif_keysize_max,
3244                                                  feat->dif_ptrsize,
3245                                                  feat->dif_recsize_max,
3246                                                  oth->ot_handle);
3247                 else
3248                         result = iam_lfix_create(obj->oo_inode,
3249                                                  feat->dif_keysize_max,
3250                                                  feat->dif_ptrsize,
3251                                                  feat->dif_recsize_max,
3252                                                  oth->ot_handle);
3253         }
3254         return result;
3255 }
3256
3257 static int osd_mkreg(struct osd_thread_info *info, struct osd_object *obj,
3258                      struct lu_attr *attr,
3259                      struct dt_allocation_hint *hint,
3260                      struct dt_object_format *dof,
3261                      struct thandle *th)
3262 {
3263         LASSERT(S_ISREG(attr->la_mode));
3264         return osd_mkfile(info, obj, (attr->la_mode &
3265                          (S_IFMT | S_IALLUGO | S_ISVTX)), hint, th,
3266                           attr);
3267 }
3268
3269 static int osd_mksym(struct osd_thread_info *info, struct osd_object *obj,
3270                      struct lu_attr *attr,
3271                      struct dt_allocation_hint *hint,
3272                      struct dt_object_format *dof,
3273                      struct thandle *th)
3274 {
3275         LASSERT(S_ISLNK(attr->la_mode));
3276         return osd_mkfile(info, obj, (attr->la_mode &
3277                          (S_IFMT | S_IALLUGO | S_ISVTX)), hint, th,
3278                           attr);
3279 }
3280
3281 static int osd_mknod(struct osd_thread_info *info, struct osd_object *obj,
3282                      struct lu_attr *attr,
3283                      struct dt_allocation_hint *hint,
3284                      struct dt_object_format *dof,
3285                      struct thandle *th)
3286 {
3287         umode_t mode = attr->la_mode & (S_IFMT | S_IALLUGO | S_ISVTX);
3288         int result;
3289
3290         LINVRNT(osd_invariant(obj));
3291         LASSERT(obj->oo_inode == NULL);
3292         LASSERT(S_ISCHR(mode) || S_ISBLK(mode) ||
3293                 S_ISFIFO(mode) || S_ISSOCK(mode));
3294
3295         result = osd_mkfile(info, obj, mode, hint, th, attr);
3296         if (result == 0) {
3297                 LASSERT(obj->oo_inode != NULL);
3298                 /*
3299                  * This inode should be marked dirty for i_rdev.  Currently
3300                  * that is done in the osd_attr_init().
3301                  */
3302                 init_special_inode(obj->oo_inode, obj->oo_inode->i_mode,
3303                                    attr->la_rdev);
3304         }
3305         LINVRNT(osd_invariant(obj));
3306         return result;
3307 }
3308
/* Signature shared by the per-type creation helpers (osd_mkdir() etc.). */
typedef int (*osd_obj_type_f)(struct osd_thread_info *info,
			      struct osd_object *obj,
			      struct lu_attr *attr,
			      struct dt_allocation_hint *hint,
			      struct dt_object_format *dof,
			      struct thandle *th);
3314
3315 static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
3316 {
3317         osd_obj_type_f result;
3318
3319         switch (type) {
3320         case DFT_DIR:
3321                 result = osd_mkdir;
3322                 break;
3323         case DFT_REGULAR:
3324                 result = osd_mkreg;
3325                 break;
3326         case DFT_SYM:
3327                 result = osd_mksym;
3328                 break;
3329         case DFT_NODE:
3330                 result = osd_mknod;
3331                 break;
3332         case DFT_INDEX:
3333                 result = osd_mk_index;
3334                 break;
3335
3336         default:
3337                 LBUG();
3338                 break;
3339         }
3340         return result;
3341 }
3342
3343
3344 static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
3345                         struct dt_object *parent, struct dt_object *child,
3346                         umode_t child_mode)
3347 {
3348         LASSERT(ah);
3349
3350         ah->dah_parent = parent;
3351         ah->dah_mode = child_mode;
3352
3353         if (parent != NULL && !dt_object_remote(parent)) {
3354                 /* will help to find FID->ino at dt_insert("..") */
3355                 struct osd_object *pobj = osd_dt_obj(parent);
3356
3357                 osd_idc_find_and_init(env, osd_obj2dev(pobj), pobj);
3358         }
3359 }
3360
3361 static void osd_attr_init(struct osd_thread_info *info, struct osd_object *obj,
3362                           struct lu_attr *attr, struct dt_object_format *dof,
3363                           struct thandle *handle)
3364 {
3365         struct inode *inode = obj->oo_inode;
3366         __u64 valid = attr->la_valid;
3367         int result;
3368
3369         attr->la_valid &= ~(LA_TYPE | LA_MODE);
3370
3371         if (dof->dof_type != DFT_NODE)
3372                 attr->la_valid &= ~LA_RDEV;
3373         if ((valid & LA_ATIME) && (attr->la_atime == inode->i_atime.tv_sec))
3374                 attr->la_valid &= ~LA_ATIME;
3375         if ((valid & LA_CTIME) && (attr->la_ctime == inode->i_ctime.tv_sec))
3376                 attr->la_valid &= ~LA_CTIME;
3377         if ((valid & LA_MTIME) && (attr->la_mtime == inode->i_mtime.tv_sec))
3378                 attr->la_valid &= ~LA_MTIME;
3379
3380         result = osd_quota_transfer(inode, attr, handle);
3381         if (result)
3382                 return;
3383
3384         if (attr->la_valid != 0) {
3385                 result = osd_inode_setattr(info->oti_env, inode, attr);
3386                 /*
3387                  * The osd_inode_setattr() should always succeed here.  The
3388                  * only error that could be returned is EDQUOT when we are
3389                  * trying to change the UID or GID of the inode. However, this
3390                  * should not happen since quota enforcement is no longer
3391                  * enabled on ldiskfs (lquota takes care of it).
3392                  */
3393                 LASSERTF(result == 0, "%d\n", result);
3394                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
3395         }
3396
3397         attr->la_valid = valid;
3398 }
3399
3400 /**
3401  * Helper function for osd_create()
3402  *
3403  * \retval 0, on success
3404  */
3405 static int __osd_create(struct osd_thread_info *info, struct osd_object *obj,
3406                         struct lu_attr *attr, struct dt_allocation_hint *hint,
3407                         struct dt_object_format *dof, struct thandle *th)
3408 {
3409         int result;
3410         __u32 umask;
3411
3412         osd_trans_exec_op(info->oti_env, th, OSD_OT_CREATE);
3413
3414         /* we drop umask so that permissions we pass are not affected */
3415         umask = current->fs->umask;
3416         current->fs->umask = 0;
3417
3418         result = osd_create_type_f(dof->dof_type)(info, obj, attr, hint, dof,
3419                                                   th);
3420         if (likely(obj->oo_inode != NULL)) {
3421                 LASSERT(obj->oo_inode->i_state & I_NEW);
3422
3423                 /*
3424                  * Unlock the inode before attr initialization to avoid
3425                  * unnecessary dqget operations. LU-6378
3426                  */
3427                 unlock_new_inode(obj->oo_inode);
3428         }
3429
3430         if (likely(result == 0)) {
3431                 osd_attr_init(info, obj, attr, dof, th);
3432                 osd_object_init0(obj);
3433         }
3434
3435         /* restore previous umask value */
3436         current->fs->umask = umask;
3437
3438         osd_trans_exec_check(info->oti_env, th, OSD_OT_CREATE);
3439
3440         return result;
3441 }
3442
3443 /**
3444  * Helper function for osd_create()
3445  *
3446  * \retval 0, on success
3447  */
3448 static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj,
3449                            const struct lu_fid *fid, struct thandle *th)
3450 {
3451         struct osd_thread_info *info = osd_oti_get(env);
3452         struct osd_inode_id    *id   = &info->oti_id;
3453         struct osd_device      *osd  = osd_obj2dev(obj);
3454         struct osd_thandle     *oh;
3455         int rc;
3456
3457         LASSERT(obj->oo_inode != NULL);
3458
3459         oh = container_of(th, struct osd_thandle, ot_super);
3460         LASSERT(oh->ot_handle);
3461         osd_trans_exec_op(env, th, OSD_OT_INSERT);
3462
3463         osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
3464         rc = osd_oi_insert(info, osd, fid, id, oh->ot_handle,
3465                            OI_CHECK_FLD, NULL);
3466         if (CFS_FAIL_CHECK(OBD_FAIL_OSD_DUPLICATE_MAP) && osd->od_is_ost) {
3467                 struct lu_fid next_fid = *fid;
3468
3469                 /* insert next object in advance, and map to the same inode */
3470                 next_fid.f_oid++;
3471                 if (next_fid.f_oid != 0) {
3472                         osd_trans_exec_op(env, th, OSD_OT_INSERT);
3473                         osd_oi_insert(info, osd, &next_fid, id, oh->ot_handle,
3474                                       OI_CHECK_FLD, NULL);
3475                         osd_trans_exec_check(env, th, OSD_OT_INSERT);
3476                 }
3477         }
3478
3479         osd_trans_exec_check(env, th, OSD_OT_INSERT);
3480
3481         return rc;
3482 }
3483
3484 int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd,
3485                    u64 seq, struct lu_seq_range *range)
3486 {
3487         struct seq_server_site *ss = osd_seq_site(osd);
3488
3489         if (fid_seq_is_idif(seq)) {
3490                 fld_range_set_ost(range);
3491                 range->lsr_index = idif_ost_idx(seq);
3492                 return 0;
3493         }
3494
3495         if (!fid_seq_in_fldb(seq)) {
3496                 fld_range_set_mdt(range);
3497                 if (ss != NULL)
3498                         /*
3499                          * FIXME: If ss is NULL, it suppose not get lsr_index
3500                          * at all
3501                          */
3502                         range->lsr_index = ss->ss_node_id;
3503                 return 0;
3504         }
3505
3506         LASSERT(ss != NULL);
3507         fld_range_set_any(range);
3508         /* OSD will only do local fld lookup */
3509         return fld_local_lookup(env, ss->ss_server_fld, seq, range);
3510 }
3511
3512 static int osd_declare_create(const struct lu_env *env, struct dt_object *dt,
3513                               struct lu_attr *attr,
3514                               struct dt_allocation_hint *hint,
3515                               struct dt_object_format *dof,
3516                               struct thandle *handle)
3517 {
3518         struct osd_thandle *oh;
3519         int rc;
3520
3521         ENTRY;
3522
3523         LASSERT(handle != NULL);
3524
3525         oh = container_of(handle, struct osd_thandle, ot_super);
3526         LASSERT(oh->ot_handle == NULL);
3527
3528         /*
3529          * EA object consumes more credits than regular object: osd_mk_index
3530          * vs. osd_mkreg: osd_mk_index will create 2 blocks for root_node and
3531          * leaf_node, could involves the block, block bitmap, groups, GDT
3532          * change for each block, so add 4 * 2 credits in that case.
3533          */
3534         osd_trans_declare_op(env, oh, OSD_OT_CREATE,
3535                              osd_dto_credits_noquota[DTO_OBJECT_CREATE] +
3536                              (dof->dof_type == DFT_INDEX) ? 4 * 2 : 0);
3537         /*
3538          * Reuse idle OI block may cause additional one OI block
3539          * to be changed.
3540          */
3541         osd_trans_declare_op(env, oh, OSD_OT_INSERT,
3542                              osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1);
3543         if (CFS_FAIL_CHECK(OBD_FAIL_OSD_DUPLICATE_MAP))
3544                 osd_trans_declare_op(env, oh, OSD_OT_INSERT,
3545                              osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1);
3546
3547         /* will help to find FID->ino mapping at dt_insert() */
3548         rc = osd_idc_find_and_init(env, osd_obj2dev(osd_dt_obj(dt)),
3549                                    osd_dt_obj(dt));
3550         if (rc != 0)
3551                 RETURN(rc);
3552
3553         if (!attr)
3554                 RETURN(0);
3555
3556         rc = osd_declare_inode_qid(env, attr->la_uid, attr->la_gid,
3557                                    attr->la_projid, 1, oh, osd_dt_obj(dt),
3558                                    NULL, OSD_QID_INODE);
3559         if (rc != 0)
3560                 RETURN(rc);
3561
3562         RETURN(rc);
3563 }
3564
3565 /**
3566  * Called to destroy on-disk representation of the object
3567  *
3568  * Concurrency: must be locked
3569  */
3570 static int osd_declare_destroy(const struct lu_env *env, struct dt_object *dt,
3571                                struct thandle *th)
3572 {
3573         struct osd_object *obj = osd_dt_obj(dt);
3574         struct inode *inode = obj->oo_inode;
3575         struct osd_thandle *oh;
3576         int rc;
3577
3578         ENTRY;
3579
3580         if (inode == NULL)
3581                 RETURN(-ENOENT);
3582
3583         oh = container_of(th, struct osd_thandle, ot_super);
3584         LASSERT(oh->ot_handle == NULL);
3585
3586         osd_trans_declare_op(env, oh, OSD_OT_DESTROY,
3587                              osd_dto_credits_noquota[DTO_OBJECT_DELETE]);
3588
3589         /* For removing agent entry */
3590         if (lu_object_has_agent_entry(&obj->oo_dt.do_lu))
3591                 oh->ot_credits += osd_dto_credits_noquota[DTO_INDEX_DELETE];
3592
3593         /*
3594          * Recycle idle OI leaf may cause additional three OI blocks
3595          * to be changed.
3596          */
3597         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
3598                 osd_trans_declare_op(env, oh, OSD_OT_DELETE,
3599                              osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3);
3600         /* one less inode */
3601         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
3602                                    i_projid_read(inode), -1, oh, obj, NULL,
3603                                    OSD_QID_INODE);
3604         if (rc)
3605                 RETURN(rc);
3606         /* data to be truncated */
3607         rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
3608                                    i_projid_read(inode), 0, oh, obj, NULL,
3609                                    OSD_QID_BLK);
3610         if (rc)
3611                 RETURN(rc);
3612
3613         /*
3614          * will help to find FID->ino when this object is being
3615          * added to PENDING
3616          */
3617         rc = osd_idc_find_and_init(env, osd_obj2dev(obj), obj);
3618
3619         RETURN(rc);
3620 }
3621
3622 static int osd_destroy(const struct lu_env *env, struct dt_object *dt,
3623                        struct thandle *th)
3624 {
3625         const struct lu_fid *fid = lu_object_fid(&dt->do_lu);
3626         struct osd_object *obj = osd_dt_obj(dt);
3627         struct inode *inode = obj->oo_inode;
3628         struct osd_device *osd = osd_obj2dev(obj);
3629         struct osd_thandle *oh;
3630         int result;
3631
3632         ENTRY;
3633
3634         oh = container_of(th, struct osd_thandle, ot_super);
3635         LASSERT(oh->ot_handle);
3636         LASSERT(inode);
3637         LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
3638
3639         if (unlikely(fid_is_acct(fid)))
3640                 RETURN(-EPERM);
3641
3642         if (lu_object_has_agent_entry(&obj->oo_dt.do_lu)) {
3643                 result = osd_delete_from_remote_parent(env, osd, obj, oh, true);
3644                 if (result != 0)
3645                         CERROR("%s: remove agent entry "DFID": rc = %d\n",
3646                                osd_name(osd), PFID(fid), result);
3647         }
3648
3649         if (S_ISDIR(inode->i_mode)) {
3650                 if (inode->i_nlink > 2)
3651                         CERROR("%s: directory "DFID" ino %lu link count is %u at unlink. run e2fsck to repair\n",
3652                                osd_name(osd), PFID(fid), inode->i_ino,
3653                                inode->i_nlink);
3654
3655                 spin_lock(&obj->oo_guard);
3656                 clear_nlink(inode);
3657                 spin_unlock(&obj->oo_guard);
3658                 osd_dirty_inode(inode, I_DIRTY_DATASYNC);
3659         }
3660
3661         osd_trans_exec_op(env, th, OSD_OT_DESTROY);
3662
3663         ldiskfs_set_inode_state(inode, LDISKFS_STATE_LUSTRE_DESTROY);
3664
3665         if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LOST_MDTOBJ2))
3666                 result = osd_oi_delete(osd_oti_get(env), osd, fid,
3667                                        oh->ot_handle, OI_CHECK_FLD);
3668
3669         osd_trans_exec_check(env, th, OSD_OT_DESTROY);
3670         /* XXX: add to ext3 orphan list */
3671         /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
3672
3673         /* not needed in the cache anymore */
3674         set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
3675         obj->oo_destroyed = 1;
3676
3677    &nbs