Whamcloud - gitweb
LU-6142 lustre: use list_first/last_entry() for list heads
[fs/lustre-release.git] / lustre / osd-zfs / osd_object.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * lustre/osd-zfs/osd_object.c
32  *
33  * Author: Alex Zhuravlev <bzzz@whamcloud.com>
34  * Author: Mike Pershin <tappro@whamcloud.com>
35  * Author: Johann Lombardi <johann@whamcloud.com>
36  */
37
38 #define DEBUG_SUBSYSTEM S_OSD
39
40 #include <libcfs/libcfs.h>
41 #include <obd_support.h>
42 #include <lustre_net.h>
43 #include <obd.h>
44 #include <obd_class.h>
45 #include <lustre_disk.h>
46 #include <lustre_fid.h>
47
48 #include "osd_internal.h"
49
50 #include <sys/dnode.h>
51 #include <sys/dbuf.h>
52 #include <sys/spa.h>
53 #include <sys/stat.h>
54 #include <sys/zap.h>
55 #include <sys/spa_impl.h>
56 #include <sys/zfs_znode.h>
57 #include <sys/dmu_tx.h>
58 #include <sys/dmu_objset.h>
59 #include <sys/dsl_prop.h>
60 #include <sys/sa_impl.h>
61 #include <sys/txg.h>
62
63 char *osd_obj_tag = "osd_object";
64 static int osd_object_sync_delay_us = -1;
65
66 static const struct dt_object_operations osd_obj_ops;
67 static const struct lu_object_operations osd_lu_obj_ops;
68 static const struct dt_object_operations osd_obj_otable_it_ops;
69
70 static void
71 osd_object_sa_fini(struct osd_object *obj)
72 {
73         if (obj->oo_sa_hdl) {
74                 sa_handle_destroy(obj->oo_sa_hdl);
75                 obj->oo_sa_hdl = NULL;
76         }
77 }
78
79 static int
80 osd_object_sa_init(struct osd_object *obj, struct osd_device *o)
81 {
82         int rc;
83
84         LASSERT(obj->oo_sa_hdl == NULL);
85         LASSERT(obj->oo_dn != NULL);
86
87         rc = osd_sa_handle_get(obj);
88         if (rc)
89                 return rc;
90
91         /* Cache the xattr object id, valid for the life of the object */
92         rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_XATTR(o), &obj->oo_xattr, 8);
93         if (rc == -ENOENT) {
94                 obj->oo_xattr = ZFS_NO_OBJECT;
95                 rc = 0;
96         } else if (rc) {
97                 osd_object_sa_fini(obj);
98         }
99
100         return rc;
101 }
102
103 /*
104  * Add object to list of dirty objects in tx handle.
105  */
106 void osd_object_sa_dirty_add(struct osd_object *obj, struct osd_thandle *oh)
107 {
108         if (!list_empty(&obj->oo_sa_linkage))
109                 return;
110
111         write_lock(&obj->oo_attr_lock);
112         if (likely(list_empty(&obj->oo_sa_linkage)))
113                 list_add(&obj->oo_sa_linkage, &oh->ot_sa_list);
114         write_unlock(&obj->oo_attr_lock);
115 }
116
117 /*
118  * Release spill block dbuf hold for all dirty SAs.
119  */
120 void osd_object_sa_dirty_rele(const struct lu_env *env, struct osd_thandle *oh)
121 {
122         struct osd_object *obj;
123
124         while (!list_empty(&oh->ot_sa_list)) {
125                 obj = list_first_entry(&oh->ot_sa_list,
126                                        struct osd_object, oo_sa_linkage);
127                 write_lock(&obj->oo_attr_lock);
128                 list_del_init(&obj->oo_sa_linkage);
129                 write_unlock(&obj->oo_attr_lock);
130                 if (obj->oo_late_xattr && obj->oo_destroyed == 0) {
131                         /*
132                          * take oo_guard to protect oo_sa_xattr buffer
133                          * from concurrent update by osd_xattr_set()
134                          */
135                         LASSERT(oh->ot_assigned != 0);
136                         down_write(&obj->oo_guard);
137                         if (obj->oo_destroyed == 0) {
138                                 if (obj->oo_late_attr_set)
139                                         __osd_sa_attr_init(env, obj, oh);
140                                 else if (obj->oo_late_xattr)
141                                         __osd_sa_xattr_update(env, obj, oh);
142                         }
143                         up_write(&obj->oo_guard);
144                 }
145                 sa_spill_rele(obj->oo_sa_hdl);
146         }
147 }
148
149 /*
150  * Update the SA and add the object to the dirty list.
151  */
152 int osd_object_sa_update(struct osd_object *obj, sa_attr_type_t type,
153                          void *buf, uint32_t buflen, struct osd_thandle *oh)
154 {
155         int rc;
156
157         LASSERT(obj->oo_sa_hdl != NULL);
158         LASSERT(oh->ot_tx != NULL);
159
160         rc = -sa_update(obj->oo_sa_hdl, type, buf, buflen, oh->ot_tx);
161         osd_object_sa_dirty_add(obj, oh);
162
163         return rc;
164 }
165
166 /*
167  * Bulk update the SA and add the object to the dirty list.
168  */
169 static int
170 osd_object_sa_bulk_update(struct osd_object *obj, sa_bulk_attr_t *attrs,
171                           int count, struct osd_thandle *oh)
172 {
173         int rc;
174
175         LASSERT(obj->oo_sa_hdl != NULL);
176         LASSERT(oh->ot_tx != NULL);
177
178         rc = -sa_bulk_update(obj->oo_sa_hdl, attrs, count, oh->ot_tx);
179         osd_object_sa_dirty_add(obj, oh);
180
181         return rc;
182 }
183
184 /*
185  * Retrieve the attributes of a DMU object
186  */
187 static int __osd_object_attr_get(const struct lu_env *env, struct osd_device *o,
188                                  struct osd_object *obj, struct lu_attr *la)
189 {
190         struct osa_attr *osa = &osd_oti_get(env)->oti_osa;
191         sa_bulk_attr_t *bulk = osd_oti_get(env)->oti_attr_bulk;
192         struct lustre_mdt_attrs *lma;
193         struct lu_buf buf;
194         int cnt = 0;
195         int              rc;
196         ENTRY;
197
198         LASSERT(obj->oo_dn != NULL);
199
200         la->la_valid |= LA_ATIME | LA_MTIME | LA_CTIME | LA_BTIME | LA_MODE |
201                         LA_TYPE | LA_SIZE | LA_UID | LA_GID | LA_FLAGS |
202                         LA_NLINK;
203
204         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(o), NULL, osa->atime, 16);
205         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(o), NULL, osa->mtime, 16);
206         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(o), NULL, osa->ctime, 16);
207         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(o), NULL, osa->btime, 16);
208         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(o), NULL, &osa->mode, 8);
209         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(o), NULL, &osa->size, 8);
210         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(o), NULL, &osa->nlink, 8);
211         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(o), NULL, &osa->uid, 8);
212         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(o), NULL, &osa->gid, 8);
213         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(o), NULL, &osa->flags, 8);
214         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
215
216         rc = -sa_bulk_lookup(obj->oo_sa_hdl, bulk, cnt);
217         if (rc)
218                 GOTO(out_sa, rc);
219
220 #ifdef ZFS_PROJINHERIT
221         if (o->od_projectused_dn && osa->flags & ZFS_PROJID) {
222                 rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PROJID(o),
223                                 &osa->projid, 8);
224                 if (rc)
225                         GOTO(out_sa, rc);
226
227                 la->la_projid = osa->projid;
228                 la->la_valid |= LA_PROJID;
229                 obj->oo_with_projid = 1;
230         } else {
231                 la->la_projid = ZFS_DEFAULT_PROJID;
232                 la->la_valid &= ~LA_PROJID;
233         }
234 #else
235         la->la_projid = 0;
236         la->la_valid &= ~LA_PROJID;
237 #endif
238
239         la->la_atime = osa->atime[0];
240         la->la_mtime = osa->mtime[0];
241         la->la_ctime = osa->ctime[0];
242         la->la_btime = osa->btime[0];
243         la->la_mode = osa->mode;
244         la->la_uid = osa->uid;
245         la->la_gid = osa->gid;
246         la->la_nlink = osa->nlink;
247         la->la_flags = attrs_zfs2fs(osa->flags);
248         la->la_size = osa->size;
249
250         /* Try to get extra flags from LMA */
251         lma = (struct lustre_mdt_attrs *)osd_oti_get(env)->oti_buf;
252         buf.lb_buf = lma;
253         buf.lb_len = sizeof(osd_oti_get(env)->oti_buf);
254         down_read(&obj->oo_guard);
255         rc = osd_xattr_get_lma(env, obj, &buf);
256         if (!rc) {
257                 lma->lma_incompat = le32_to_cpu(lma->lma_incompat);
258                 obj->oo_lma_flags =
259                         lma_to_lustre_flags(lma->lma_incompat);
260         } else if (rc == -ENODATA ||
261                    !(S_ISDIR(la->la_mode) &&
262                      dt_object_exists(&obj->oo_dt))) {
263                 rc = 0;
264         }
265         up_read(&obj->oo_guard);
266
267         if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode)) {
268                 rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_RDEV(o), &osa->rdev, 8);
269                 if (rc)
270                         GOTO(out_sa, rc);
271                 la->la_rdev = osa->rdev;
272                 la->la_valid |= LA_RDEV;
273         }
274 out_sa:
275
276         RETURN(rc);
277 }
278
279 int __osd_obj2dnode(objset_t *os, uint64_t oid, dnode_t **dnp)
280 {
281         dmu_buf_t *db;
282         dmu_buf_impl_t *dbi;
283         int rc;
284
285         rc = -dmu_bonus_hold(os, oid, osd_obj_tag, &db);
286         if (rc)
287                 return rc;
288
289         dbi = (dmu_buf_impl_t *)db;
290         DB_DNODE_ENTER(dbi);
291         *dnp = DB_DNODE(dbi);
292         DB_DNODE_EXIT(dbi);
293         LASSERT(*dnp != NULL);
294
295         return 0;
296 }
297
298 /*
299  * Concurrency: no concurrent access is possible that early in object
300  * life-cycle.
301  */
302 struct lu_object *osd_object_alloc(const struct lu_env *env,
303                                    const struct lu_object_header *hdr,
304                                    struct lu_device *d)
305 {
306         struct osd_object *mo;
307
308         OBD_SLAB_ALLOC_PTR_GFP(mo, osd_object_kmem, GFP_NOFS);
309         if (mo != NULL) {
310                 struct lu_object *l;
311                 struct lu_object_header *h;
312                 struct osd_device *o = osd_dev(d);
313                 int i;
314
315                 l = &mo->oo_dt.do_lu;
316                 if (unlikely(o->od_in_init)) {
317                         OBD_ALLOC_PTR(h);
318                         if (!h) {
319                                 OBD_FREE_PTR(mo);
320                                 return NULL;
321                         }
322
323                         lu_object_header_init(h);
324                         lu_object_init(l, h, d);
325                         lu_object_add_top(h, l);
326                         mo->oo_header = h;
327                 } else {
328                         dt_object_init(&mo->oo_dt, NULL, d);
329                         mo->oo_header = NULL;
330                 }
331
332                 mo->oo_dt.do_ops = &osd_obj_ops;
333                 l->lo_ops = &osd_lu_obj_ops;
334                 INIT_LIST_HEAD(&mo->oo_sa_linkage);
335                 INIT_LIST_HEAD(&mo->oo_unlinked_linkage);
336                 init_rwsem(&mo->oo_sem);
337                 init_rwsem(&mo->oo_guard);
338                 rwlock_init(&mo->oo_attr_lock);
339                 mo->oo_destroy = OSD_DESTROY_NONE;
340                 for (i = 0; i < OSD_MAX_DBUFS; i++)
341                         mo->oo_dbs[i] = NULL;
342                 return l;
343         } else {
344                 return NULL;
345         }
346 }
347
348 static void osd_obj_set_blksize(const struct lu_env *env,
349                                 struct osd_device *osd, struct osd_object *obj)
350 {
351         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
352         dmu_tx_t *tx;
353         dnode_t *dn = obj->oo_dn;
354         uint32_t blksz;
355         int rc = 0;
356         ENTRY;
357
358         LASSERT(!osd_oti_get(env)->oti_in_trans);
359
360         tx = dmu_tx_create(osd->od_os);
361         if (!tx) {
362                 CERROR("%s: fail to create tx to set blksize for "DFID"\n",
363                        osd->od_svname, PFID(fid));
364                 RETURN_EXIT;
365         }
366
367         dmu_tx_hold_bonus(tx, dn->dn_object);
368         rc = -dmu_tx_assign(tx, TXG_WAIT);
369         if (rc) {
370                 dmu_tx_abort(tx);
371                 CERROR("%s: fail to assign tx to set blksize for "DFID
372                        ": rc = %d\n", osd->od_svname, PFID(fid), rc);
373                 RETURN_EXIT;
374         }
375
376         down_write(&obj->oo_guard);
377         if (unlikely((1 << dn->dn_datablkshift) >= PAGE_SIZE))
378                 GOTO(out, rc = 1);
379
380         blksz = dn->dn_datablksz;
381         if (!is_power_of_2(blksz))
382                 blksz = size_roundup_power2(blksz);
383
384         if (blksz > osd->od_max_blksz)
385                 blksz = osd->od_max_blksz;
386         else if (blksz < PAGE_SIZE)
387                 blksz = PAGE_SIZE;
388         rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object, blksz, 0, tx);
389
390         GOTO(out, rc);
391
392 out:
393         up_write(&obj->oo_guard);
394         if (rc) {
395                 dmu_tx_abort(tx);
396                 if (unlikely(obj->oo_dn->dn_maxblkid > 0))
397                         rc = 1;
398                 if (rc < 0)
399                         CERROR("%s: fail to set blksize for "DFID": rc = %d\n",
400                                osd->od_svname, PFID(fid), rc);
401         } else {
402                 dmu_tx_commit(tx);
403                 CDEBUG(D_INODE, "%s: set blksize as %u for "DFID"\n",
404                        osd->od_svname, blksz, PFID(fid));
405         }
406 }
407
408 /*
409  * Concurrency: shouldn't matter.
410  */
411 static int osd_object_init0(const struct lu_env *env, struct osd_object *obj)
412 {
413         struct osd_device       *osd = osd_obj2dev(obj);
414         const struct lu_fid     *fid = lu_object_fid(&obj->oo_dt.do_lu);
415         int                      rc = 0;
416         ENTRY;
417
418         LASSERT(obj->oo_dn);
419
420         rc = osd_object_sa_init(obj, osd);
421         if (rc)
422                 RETURN(rc);
423
424         /* cache attrs in object */
425         rc = __osd_object_attr_get(env, osd, obj, &obj->oo_attr);
426         if (rc)
427                 RETURN(rc);
428
429         if (likely(!fid_is_acct(fid))) {
430                 /* no body operations for accounting objects */
431                 obj->oo_dt.do_body_ops = &osd_body_ops;
432
433                 if (S_ISREG(obj->oo_attr.la_mode) &&
434                     obj->oo_dn->dn_maxblkid == 0 &&
435                     (1 << obj->oo_dn->dn_datablkshift) < PAGE_SIZE &&
436                     (fid_is_idif(fid) || fid_is_norm(fid) ||
437                      fid_is_echo(fid)) &&
438                     osd->od_is_ost && !osd->od_dt_dev.dd_rdonly)
439                         osd_obj_set_blksize(env, osd, obj);
440         }
441
442         /*
443          * initialize object before marking it existing
444          */
445         obj->oo_dt.do_lu.lo_header->loh_attr |= obj->oo_attr.la_mode & S_IFMT;
446
447         smp_mb();
448         obj->oo_dt.do_lu.lo_header->loh_attr |= LOHA_EXISTS;
449
450         RETURN(0);
451 }
452
453 static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
454 {
455         struct osd_thread_info  *info = osd_oti_get(env);
456         struct lu_buf           buf;
457         int                     rc;
458         struct lustre_mdt_attrs *lma;
459         const struct lu_fid *rfid = lu_object_fid(&obj->oo_dt.do_lu);
460         ENTRY;
461
462         BUILD_BUG_ON(sizeof(info->oti_buf) < sizeof(*lma));
463         lma = (struct lustre_mdt_attrs *)info->oti_buf;
464         buf.lb_buf = lma;
465         buf.lb_len = sizeof(info->oti_buf);
466
467         rc = osd_xattr_get(env, &obj->oo_dt, &buf, XATTR_NAME_LMA);
468         if (rc > 0) {
469                 rc = 0;
470                 lustre_lma_swab(lma);
471                 if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
472                              CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
473                         CWARN("%s: unsupported incompat LMA feature(s) %#x for "
474                               "fid = "DFID"\n", osd_obj2dev(obj)->od_svname,
475                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
476                               PFID(rfid));
477                         rc = -EOPNOTSUPP;
478                 } else if (unlikely(!lu_fid_eq(rfid, &lma->lma_self_fid))) {
479                         CERROR("%s: FID-in-LMA "DFID" does not match the "
480                               "object self-fid "DFID"\n",
481                               osd_obj2dev(obj)->od_svname,
482                               PFID(&lma->lma_self_fid), PFID(rfid));
483                         rc = -EREMCHG;
484                 } else {
485                         struct osd_device *osd = osd_obj2dev(obj);
486
487                         if (lma->lma_compat & LMAC_STRIPE_INFO &&
488                             osd->od_is_ost)
489                                 obj->oo_pfid_in_lma = 1;
490                         if (unlikely(lma->lma_incompat & LMAI_REMOTE_PARENT) &&
491                             osd->od_remote_parent_dir != ZFS_NO_OBJECT)
492                                 lu_object_set_agent_entry(&obj->oo_dt.do_lu);
493                 }
494         } else if (rc == -ENODATA) {
495                 /* haven't initialize LMA xattr */
496                 rc = 0;
497         }
498
499         RETURN(rc);
500 }
501
502 /**
503  * Helper function to retrieve DMU object id from fid for accounting object
504  */
505 static dnode_t *osd_quota_fid2dmu(const struct osd_device *osd,
506                                   const struct lu_fid *fid)
507 {
508         dnode_t *dn = NULL;
509
510         LASSERT(fid_is_acct(fid));
511
512         switch (fid_oid(fid)) {
513         case ACCT_USER_OID:
514                 dn = osd->od_userused_dn;
515                 break;
516         case ACCT_GROUP_OID:
517                 dn = osd->od_groupused_dn;
518                 break;
519 #ifdef ZFS_PROJINHERIT
520         case ACCT_PROJECT_OID:
521                 dn = osd->od_projectused_dn;
522                 break;
523 #endif
524         default:
525                 break;
526         }
527
528         return dn;
529 }
530
531 /*
532  * Concurrency: no concurrent access is possible that early in object
533  * life-cycle.
534  */
535 static int osd_object_init(const struct lu_env *env, struct lu_object *l,
536                            const struct lu_object_conf *conf)
537 {
538         struct osd_object *obj = osd_obj(l);
539         struct osd_device *osd = osd_obj2dev(obj);
540         const struct lu_fid *fid = lu_object_fid(l);
541         struct lustre_scrub *scrub = &osd->od_scrub;
542         struct osd_thread_info *info = osd_oti_get(env);
543         struct luz_direntry *zde = &info->oti_zde;
544         struct osd_idmap_cache *idc;
545         char *name = info->oti_str;
546         uint64_t oid;
547         int rc = 0;
548         int rc1;
549         bool remote = false;
550         ENTRY;
551
552         LASSERT(osd_invariant(obj));
553
554         if (fid_is_otable_it(&l->lo_header->loh_fid)) {
555                 obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
556                 l->lo_header->loh_attr |= LOHA_EXISTS;
557
558                 GOTO(out, rc = 0);
559         }
560
561         if (conf && conf->loc_flags & LOC_F_NEW)
562                 GOTO(out, rc = 0);
563
564         if (unlikely(fid_is_acct(fid))) {
565                 obj->oo_dn = osd_quota_fid2dmu(osd, fid);
566                 if (obj->oo_dn) {
567                         obj->oo_dt.do_index_ops = &osd_acct_index_ops;
568                         l->lo_header->loh_attr |= LOHA_EXISTS;
569                 }
570
571                 GOTO(out, rc = 0);
572         }
573
574         idc = osd_idc_find(env, osd, fid);
575         if (idc && !idc->oic_remote && idc->oic_dnode != ZFS_NO_OBJECT) {
576                 oid = idc->oic_dnode;
577                 goto zget;
578         }
579
580         rc = -ENOENT;
581         if (!list_empty(&osd->od_scrub.os_inconsistent_items))
582                 rc = osd_oii_lookup(osd, fid, &oid);
583
584         if (rc)
585                 rc = osd_fid_lookup(env, osd, fid, &oid);
586
587         if (rc == -ENOENT) {
588                 if (likely(!(fid_is_norm(fid) || fid_is_igif(fid)) ||
589                            fid_is_on_ost(env, osd, fid) ||
590                            !zfs_test_bit(osd_oi_fid2idx(osd, fid),
591                                          scrub->os_file.sf_oi_bitmap)))
592                         GOTO(out, rc = 0);
593
594                 rc = -EREMCHG;
595                 goto trigger;
596         }
597
598         if (rc)
599                 GOTO(out, rc);
600
601 zget:
602         LASSERT(obj->oo_dn == NULL);
603
604         rc = __osd_obj2dnode(osd->od_os, oid, &obj->oo_dn);
605         /* EEXIST will be returned if object is being deleted in ZFS */
606         if (rc == -EEXIST)
607                 GOTO(out, rc = 0);
608
609         if (rc) {
610                 CERROR("%s: lookup "DFID"/%#llx failed: rc = %d\n",
611                        osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
612                 GOTO(out, rc);
613         }
614
615         rc = osd_object_init0(env, obj);
616         if (rc)
617                 GOTO(out, rc);
618
619         if (unlikely(obj->oo_header))
620                 GOTO(out, rc = 0);
621
622         rc = osd_check_lma(env, obj);
623         if (rc != -EREMCHG)
624                 GOTO(out, rc);
625
626         osd_scrub_refresh_mapping(env, osd, fid, oid, DTO_INDEX_DELETE, true,
627                                   NULL);
628
629 trigger:
630         /* We still have chance to get the valid dnode: for the object that is
631          * referenced by remote name entry, the object on the local MDT will be
632          * linked under the dir /REMOTE_PARENT_DIR with its FID string as name.
633          *
634          * During the OI scrub, if we cannot find the OI mapping, we may still
635          * have change to map the FID to local OID via lookup the dir
636          * /REMOTE_PARENT_DIR. */
637         if (!remote && !fid_is_on_ost(env, osd, fid)) {
638                 osd_fid2str(name, fid, sizeof(info->oti_str));
639                 rc = osd_zap_lookup(osd, osd->od_remote_parent_dir,
640                                     NULL, name, 8, 3, (void *)zde);
641                 if (!rc) {
642                         oid = zde->lzd_reg.zde_dnode;
643                         osd_dnode_rele(obj->oo_dn);
644                         obj->oo_dn = NULL;
645                         remote = true;
646                         goto zget;
647                 }
648         }
649
650         /* The case someone triggered the OI scrub already. */
651         if (scrub->os_running) {
652                 if (!rc) {
653                         LASSERT(remote);
654
655                         lu_object_set_agent_entry(l);
656                         osd_oii_insert(env, osd, fid, oid, false);
657                 } else {
658                         rc = -EINPROGRESS;
659                 }
660
661                 GOTO(out, rc);
662         }
663
664         /* The case NOT allow to trigger OI scrub automatically. */
665         if (osd->od_scrub.os_auto_scrub_interval == AS_NEVER)
666                 GOTO(out, rc);
667
668         /* It is me to trigger the OI scrub. */
669         rc1 = osd_scrub_start(env, osd, SS_CLEAR_DRYRUN |
670                               SS_CLEAR_FAILOUT | SS_AUTO_FULL);
671         CDEBUG_LIMIT(D_LFSCK | D_CONSOLE | D_WARNING,
672                      "%s: trigger OI scrub by RPC for "DFID"/%#llx: rc = %d\n",
673                      osd_name(osd), PFID(fid), oid, rc1);
674         if (!rc) {
675                 LASSERT(remote);
676
677                 lu_object_set_agent_entry(l);
678                 if (!rc1)
679                         osd_oii_insert(env, osd, fid, oid, false);
680         } else {
681                 if (!rc1)
682                         rc = -EINPROGRESS;
683                 else
684                         rc = -EREMCHG;
685         }
686
687         GOTO(out, rc);
688
689 out:
690         RETURN(rc);
691 }
692
693 /*
694  * Concurrency: no concurrent access is possible that late in object
695  * life-cycle.
696  */
697 static void osd_object_free(const struct lu_env *env, struct lu_object *l)
698 {
699         struct osd_object *obj = osd_obj(l);
700         struct lu_object_header *h = obj->oo_header;
701
702         LASSERT(osd_invariant(obj));
703
704         dt_object_fini(&obj->oo_dt);
705         /* obj doesn't contain an lu_object_header, so we don't need call_rcu */
706         OBD_SLAB_FREE_PTR(obj, osd_object_kmem);
707         if (unlikely(h))
708                 lu_object_header_free(h);
709 }
710
711 static int
712 osd_object_unlinked_add(struct osd_object *obj, struct osd_thandle *oh)
713 {
714         int rc = -EBUSY;
715
716         LASSERT(obj->oo_destroy == OSD_DESTROY_ASYNC);
717
718         /* the object is supposed to be exclusively locked by
719          * the caller (osd_destroy()), while the transaction
720          * (oh) is per-thread and not shared */
721         if (likely(list_empty(&obj->oo_unlinked_linkage))) {
722                 list_add(&obj->oo_unlinked_linkage, &oh->ot_unlinked_list);
723                 rc = 0;
724         }
725
726         return rc;
727 }
728
729 /* Default to max data size covered by a level-1 indirect block */
730 static unsigned long osd_sync_destroy_max_size =
731         1UL << (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT + SPA_MAXBLOCKSHIFT);
732 module_param(osd_sync_destroy_max_size, ulong, 0444);
733 MODULE_PARM_DESC(osd_sync_destroy_max_size, "Maximum object size to use synchronous destroy.");
734
735 static inline void
736 osd_object_set_destroy_type(struct osd_object *obj)
737 {
738         /*
739          * Lock-less OST_WRITE can race with OST_DESTROY, so set destroy type
740          * only once and use it consistently thereafter.
741          */
742         down_write(&obj->oo_guard);
743         if (obj->oo_destroy == OSD_DESTROY_NONE) {
744                 if (obj->oo_attr.la_size <= osd_sync_destroy_max_size)
745                         obj->oo_destroy = OSD_DESTROY_SYNC;
746                 else /* Larger objects are destroyed asynchronously */
747                         obj->oo_destroy = OSD_DESTROY_ASYNC;
748         }
749         up_write(&obj->oo_guard);
750 }
751
752 static int osd_declare_destroy(const struct lu_env *env, struct dt_object *dt,
753                                struct thandle *th)
754 {
755         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
756         struct osd_object       *obj = osd_dt_obj(dt);
757         struct osd_device       *osd = osd_obj2dev(obj);
758         struct osd_thandle      *oh;
759         dnode_t *dn;
760         int                      rc;
761         uint64_t                 zapid;
762         ENTRY;
763
764         LASSERT(th != NULL);
765         LASSERT(dt_object_exists(dt));
766
767         oh = container_of(th, struct osd_thandle, ot_super);
768         LASSERT(oh->ot_tx != NULL);
769
770         dmu_tx_mark_netfree(oh->ot_tx);
771
772         /* declare that we'll remove object from fid-dnode mapping */
773         zapid = osd_get_name_n_idx(env, osd, fid, NULL, 0, &dn);
774         osd_tx_hold_zap(oh->ot_tx, zapid, dn, FALSE, NULL);
775
776         osd_declare_xattrs_destroy(env, obj, oh);
777
778         /* one less inode */
779         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
780                                obj->oo_attr.la_gid, obj->oo_attr.la_projid,
781                                -1, oh, NULL, OSD_QID_INODE);
782         if (rc)
783                 RETURN(rc);
784
785         /* data to be truncated */
786         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
787                                obj->oo_attr.la_gid, obj->oo_attr.la_projid,
788                                0, oh, NULL, OSD_QID_BLK);
789         if (rc)
790                 RETURN(rc);
791
792         osd_object_set_destroy_type(obj);
793         if (obj->oo_destroy == OSD_DESTROY_SYNC)
794                 dmu_tx_hold_free(oh->ot_tx, obj->oo_dn->dn_object,
795                                  0, DMU_OBJECT_END);
796         else
797                 osd_tx_hold_zap(oh->ot_tx, osd->od_unlinked->dn_object,
798                                 osd->od_unlinked, TRUE, NULL);
799
800         /* remove agent entry (if have) from remote parent */
801         if (lu_object_has_agent_entry(&obj->oo_dt.do_lu))
802                 osd_tx_hold_zap(oh->ot_tx, osd->od_remote_parent_dir,
803                                 NULL, FALSE, NULL);
804
805         /* will help to find FID->ino when this object is being
806          * added to PENDING/ */
807         osd_idc_find_and_init(env, osd, obj);
808
809         RETURN(0);
810 }
811
812 static int osd_destroy(const struct lu_env *env, struct dt_object *dt,
813                        struct thandle *th)
814 {
815         struct osd_thread_info  *info = osd_oti_get(env);
816         char                    *buf = info->oti_str;
817         struct osd_object       *obj = osd_dt_obj(dt);
818         struct osd_device       *osd = osd_obj2dev(obj);
819         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
820         struct osd_thandle      *oh;
821         int                      rc;
822         uint64_t                 oid, zapid;
823         dnode_t *zdn;
824         ENTRY;
825
826         down_write(&obj->oo_guard);
827
828         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
829                 GOTO(out, rc = -ENOENT);
830
831         LASSERT(obj->oo_dn != NULL);
832
833         oh = container_of(th, struct osd_thandle, ot_super);
834         LASSERT(oh != NULL);
835         LASSERT(oh->ot_tx != NULL);
836
837         /* remove obj ref from index dir (it depends) */
838         zapid = osd_get_name_n_idx(env, osd, fid, buf,
839                                    sizeof(info->oti_str), &zdn);
840         rc = osd_xattrs_destroy(env, obj, oh);
841         if (rc) {
842                 CERROR("%s: cannot destroy xattrs for %s: rc = %d\n",
843                        osd->od_svname, buf, rc);
844                 GOTO(out, rc);
845         }
846
847         if (lu_object_has_agent_entry(&obj->oo_dt.do_lu)) {
848                 rc = osd_delete_from_remote_parent(env, osd, obj, oh, true);
849                 if (rc)
850                         GOTO(out, rc);
851         }
852
853         oid = obj->oo_dn->dn_object;
854         if (unlikely(obj->oo_destroy == OSD_DESTROY_NONE)) {
855                 /* this may happen if the destroy wasn't declared
856                  * e.g. when the object is created and then destroyed
857                  * in the same transaction - we don't need additional
858                  * space for destroy specifically */
859                 LASSERT(obj->oo_attr.la_size <= osd_sync_destroy_max_size);
860                 rc = -dmu_object_free(osd->od_os, oid, oh->ot_tx);
861                 if (rc)
862                         CERROR("%s: failed to free %s/%#llx: rc = %d\n",
863                                osd->od_svname, buf, oid, rc);
864         } else if (obj->oo_destroy == OSD_DESTROY_SYNC) {
865                 rc = -dmu_object_free(osd->od_os, oid, oh->ot_tx);
866                 if (rc)
867                         CERROR("%s: failed to free %s/%#llx: rc = %d\n",
868                                osd->od_svname, buf, oid, rc);
869         } else { /* asynchronous destroy */
870                 char *key = info->oti_key;
871
872                 rc = osd_object_unlinked_add(obj, oh);
873                 if (rc)
874                         GOTO(out, rc);
875
876                 snprintf(key, sizeof(info->oti_key), "%llx", oid);
877                 rc = osd_zap_add(osd, osd->od_unlinked->dn_object,
878                                  osd->od_unlinked, key, 8, 1, &oid, oh->ot_tx);
879                 if (rc)
880                         CERROR("%s: zap_add_int() failed %s/%#llx: rc = %d\n",
881                                osd->od_svname, buf, oid, rc);
882         }
883
884         /* Remove the OI mapping after the destroy to handle the race with
885          * OI scrub that may insert missed OI mapping during the interval. */
886         rc = osd_zap_remove(osd, zapid, zdn, buf, oh->ot_tx);
887         if (unlikely(rc == -ENOENT))
888                 rc = 0;
889         if (rc)
890                 CERROR("%s: zap_remove(%s) failed: rc = %d\n",
891                        osd->od_svname, buf, rc);
892
893         GOTO(out, rc);
894
895 out:
896         /* not needed in the cache anymore */
897         set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
898         if (rc == 0)
899                 obj->oo_destroyed = 1;
900         up_write(&obj->oo_guard);
901         RETURN (0);
902 }
903
904 static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
905 {
906         struct osd_object *obj = osd_obj(l);
907         const struct lu_fid *fid = lu_object_fid(l);
908         dmu_buf_t **dbs = obj->oo_dbs;
909         int i;
910
911         for (i = 0; i < OSD_MAX_DBUFS; i++) {
912                 if (dbs[i])
913                         dbuf_rele((dmu_buf_impl_t *)dbs[i], osd_0copy_tag);
914         }
915
916         if (obj->oo_dn) {
917                 if (likely(!fid_is_acct(fid))) {
918                         osd_object_sa_fini(obj);
919                         if (obj->oo_sa_xattr) {
920                                 nvlist_free(obj->oo_sa_xattr);
921                                 obj->oo_sa_xattr = NULL;
922                         }
923                         osd_dnode_rele(obj->oo_dn);
924                         list_del(&obj->oo_sa_linkage);
925                 }
926                 obj->oo_dn = NULL;
927         }
928 }
929
/*
 * ->loo_object_release() callback; this OSD has nothing to do here.
 *
 * Concurrency: ->loo_object_release() is called under site spin-lock.
 */
static void osd_object_release(const struct lu_env *env,
			       struct lu_object *l)
{
	/* intentionally empty */
}
937
938 /*
939  * Concurrency: shouldn't matter.
940  */
941 static int osd_object_print(const struct lu_env *env, void *cookie,
942                             lu_printer_t p, const struct lu_object *l)
943 {
944         struct osd_object *o = osd_obj(l);
945
946         return (*p)(env, cookie, LUSTRE_OSD_ZFS_NAME"-object@%p", o);
947 }
948
949 static void osd_read_lock(const struct lu_env *env, struct dt_object *dt,
950                           unsigned role)
951 {
952         struct osd_object *obj = osd_dt_obj(dt);
953
954         LASSERT(osd_invariant(obj));
955
956         down_read_nested(&obj->oo_sem, role);
957 }
958
959 static void osd_write_lock(const struct lu_env *env, struct dt_object *dt,
960                            unsigned role)
961 {
962         struct osd_object *obj = osd_dt_obj(dt);
963
964         LASSERT(osd_invariant(obj));
965
966         down_write_nested(&obj->oo_sem, role);
967 }
968
969 static void osd_read_unlock(const struct lu_env *env, struct dt_object *dt)
970 {
971         struct osd_object *obj = osd_dt_obj(dt);
972
973         LASSERT(osd_invariant(obj));
974         up_read(&obj->oo_sem);
975 }
976
977 static void osd_write_unlock(const struct lu_env *env, struct dt_object *dt)
978 {
979         struct osd_object *obj = osd_dt_obj(dt);
980
981         LASSERT(osd_invariant(obj));
982         up_write(&obj->oo_sem);
983 }
984
985 static int osd_write_locked(const struct lu_env *env, struct dt_object *dt)
986 {
987         struct osd_object *obj = osd_dt_obj(dt);
988         int rc = 1;
989
990         LASSERT(osd_invariant(obj));
991
992         if (down_write_trylock(&obj->oo_sem)) {
993                 rc = 0;
994                 up_write(&obj->oo_sem);
995         }
996         return rc;
997 }
998
999 static int osd_attr_get(const struct lu_env *env, struct dt_object *dt,
1000                         struct lu_attr *attr)
1001 {
1002         struct osd_object *obj = osd_dt_obj(dt);
1003         struct osd_device *osd = osd_obj2dev(obj);
1004         uint64_t blocks;
1005         uint32_t blksize;
1006         int rc = 0;
1007
1008         down_read(&obj->oo_guard);
1009
1010         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1011                 GOTO(out, rc = -ENOENT);
1012
1013         if (unlikely(fid_is_acct(lu_object_fid(&dt->do_lu))))
1014                 GOTO(out, rc = 0);
1015
1016         LASSERT(osd_invariant(obj));
1017         LASSERT(obj->oo_dn);
1018
1019         read_lock(&obj->oo_attr_lock);
1020         *attr = obj->oo_attr;
1021         if (obj->oo_lma_flags & LUSTRE_ORPHAN_FL) {
1022                 attr->la_valid |= LA_FLAGS;
1023                 attr->la_flags |= LUSTRE_ORPHAN_FL;
1024         }
1025         if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL) {
1026                 attr->la_valid |= LA_FLAGS;
1027                 attr->la_flags |= LUSTRE_ENCRYPT_FL;
1028         }
1029         read_unlock(&obj->oo_attr_lock);
1030         if (attr->la_valid & LA_FLAGS && attr->la_flags & LUSTRE_ORPHAN_FL)
1031                 CDEBUG(D_INFO, "%s: set orphan flag on "DFID" (%#llx/%#x)\n",
1032                        osd_obj2dev(obj)->od_svname,
1033                        PFID(lu_object_fid(&dt->do_lu)),
1034                        attr->la_valid, obj->oo_lma_flags);
1035
1036         /* with ZFS_DEBUG zrl_add_debug() called by DB_DNODE_ENTER()
1037          * from within sa_object_size() can block on a mutex, so
1038          * we can't call sa_object_size() holding rwlock */
1039         sa_object_size(obj->oo_sa_hdl, &blksize, &blocks);
1040         /* we do not control size of indices, so always calculate
1041          * it from number of blocks reported by DMU */
1042         if (S_ISDIR(attr->la_mode)) {
1043                 attr->la_size = 512 * blocks;
1044                 rc = -zap_count(osd->od_os, obj->oo_dn->dn_object,
1045                                 &attr->la_dirent_count);
1046         }
1047         /* Block size may be not set; suggest maximal I/O transfers. */
1048         if (blksize == 0)
1049                 blksize = osd_spa_maxblocksize(
1050                         dmu_objset_spa(osd_obj2dev(obj)->od_os));
1051
1052         attr->la_blksize = blksize;
1053         attr->la_blocks = blocks;
1054         attr->la_valid |= LA_BLOCKS | LA_BLKSIZE;
1055
1056 out:
1057         up_read(&obj->oo_guard);
1058         return rc;
1059 }
1060
#ifdef ZFS_PROJINHERIT
/*
 * Add the project ID attribute to an object that has none yet.
 *
 * An object upgraded from an old system has no slot for the project ID in
 * its on-disk layout, while the quota accounting logic accesses the related
 * slots directly by offset.  The whole SA set is therefore rewritten from a
 * template so that the project ID lands at a unified, fixed offset.
 *
 * \retval -EEXIST the object already has a project ID attribute
 * \retval 0       the attributes were replaced successfully
 * \retval other   negative error from the lookup, the xattr load/packing,
 *                 the buffer allocation or the SA replacement
 */
static int osd_add_projid(const struct lu_env *env, struct osd_object *obj,
			  struct osd_thandle *oh, uint64_t projid)
{
	struct osd_thread_info *info = osd_oti_get(env);
	sa_bulk_attr_t *bulk = info->oti_attr_bulk;
	struct osa_attr *osa = &info->oti_osa;
	struct osd_device *osd = osd_obj2dev(obj);
	char *packed = NULL;
	size_t sa_size;
	uint64_t gen;
	int cnt = 0;
	int rc;

	/* anything but "no such attribute" ends the story right here */
	rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PROJID(osd), &osa->projid, 8);
	if (unlikely(rc == 0))
		rc = -EEXIST;
	if (rc != -ENOENT)
		GOTO(out, rc);

	/* rebuild the complete attribute set from the cached attributes */
	gen = dmu_tx_get_txg(oh->ot_tx);
	osa->mode = obj->oo_attr.la_mode;
	osa->size  = obj->oo_attr.la_size;
	osa->uid = obj->oo_attr.la_uid;
	osa->gid = obj->oo_attr.la_gid;
	osa->flags = attrs_fs2zfs(obj->oo_attr.la_flags) | ZFS_PROJID;
	osa->atime[0] = obj->oo_attr.la_atime;
	osa->mtime[0] = obj->oo_attr.la_mtime;
	osa->ctime[0] = obj->oo_attr.la_ctime;
	osa->btime[0] = obj->oo_attr.la_btime;
	osa->nlink = obj->oo_attr.la_nlink;
	osa->projid = projid;
	osa->rdev = obj->oo_attr.la_rdev;

	/* the order of the entries below is part of the layout */
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL, &osa->mode, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL, &osa->size, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GEN(osd), NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL, &osa->uid, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL, &osa->gid, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PARENT(osd), NULL,
			 &obj->oo_parent, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL, &osa->flags, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL, osa->atime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL, osa->mtime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, osa->btime, 16);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL, &osa->projid, 8);
	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
	LASSERT(cnt <= ARRAY_SIZE(info->oti_attr_bulk));

	/* the SA xattrs have to be carried over into the new layout too */
	if (obj->oo_sa_xattr == NULL) {
		rc = __osd_xattr_load(osd, obj->oo_sa_hdl, &obj->oo_sa_xattr);
		if (rc)
			GOTO(out, rc);
	}

	if (obj->oo_sa_xattr) {
		rc = -nvlist_size(obj->oo_sa_xattr, &sa_size, NV_ENCODE_XDR);
		if (rc)
			GOTO(out, rc);

		packed = osd_zio_buf_alloc(sa_size);
		if (packed == NULL)
			GOTO(out, rc = -ENOMEM);

		rc = -nvlist_pack(obj->oo_sa_xattr, &packed, &sa_size,
				  NV_ENCODE_XDR, KM_SLEEP);
		if (rc)
			GOTO(out, rc);

		SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_DXATTR(osd),
				 NULL, packed, sa_size);
	}

	rc = -sa_replace_all_by_template(obj->oo_sa_hdl, bulk, cnt, oh->ot_tx);
out:
	/* sa_size is only meaningful (and only read) once packed is set */
	if (packed)
		osd_zio_buf_free(packed, sa_size);
	return rc;
}
#endif
1148
1149 static int osd_declare_attr_set(const struct lu_env *env,
1150                                 struct dt_object *dt,
1151                                 const struct lu_attr *attr,
1152                                 struct thandle *handle)
1153 {
1154         struct osd_thread_info  *info = osd_oti_get(env);
1155         struct osd_object       *obj = osd_dt_obj(dt);
1156         struct osd_device       *osd = osd_obj2dev(obj);
1157         dmu_tx_hold_t           *txh;
1158         struct osd_thandle      *oh;
1159         uint64_t                 bspace;
1160         uint32_t                 blksize;
1161         int                      rc = 0;
1162         bool                     found;
1163         ENTRY;
1164
1165
1166         LASSERT(handle != NULL);
1167         LASSERT(osd_invariant(obj));
1168
1169         oh = container_of(handle, struct osd_thandle, ot_super);
1170
1171         down_read(&obj->oo_guard);
1172         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1173                 GOTO(out_sem, rc = 0);
1174
1175         LASSERT(obj->oo_sa_hdl != NULL);
1176         LASSERT(oh->ot_tx != NULL);
1177         /* regular attributes are part of the bonus buffer */
1178         /* let's check whether this object is already part of
1179          * transaction.. */
1180         found = false;
1181         for (txh = list_head(&oh->ot_tx->tx_holds); txh;
1182              txh = list_next(&oh->ot_tx->tx_holds, txh)) {
1183                 if (txh->txh_dnode == NULL)
1184                         continue;
1185                 if (txh->txh_dnode->dn_object != obj->oo_dn->dn_object)
1186                         continue;
1187                 /* this object is part of the transaction already
1188                  * we don't need to declare bonus again */
1189                 found = true;
1190                 break;
1191         }
1192         if (!found)
1193                 dmu_tx_hold_bonus(oh->ot_tx, obj->oo_dn->dn_object);
1194         if (oh->ot_tx->tx_err != 0)
1195                 GOTO(out_sem, rc = -oh->ot_tx->tx_err);
1196
1197         if (attr && attr->la_valid & LA_FLAGS) {
1198                 /* punch must be aware we are dealing with an encrypted file */
1199                 if (attr->la_flags & LUSTRE_ENCRYPT_FL)
1200                         obj->oo_lma_flags |= LUSTRE_ENCRYPT_FL;
1201         }
1202
1203         if (attr && (attr->la_valid & (LA_UID | LA_GID | LA_PROJID))) {
1204                 sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
1205                 bspace = toqb(bspace * 512);
1206
1207                 CDEBUG(D_QUOTA,
1208                        "%s: enforce quota on UID %u, GID %u, the quota space is %lld (%u)\n",
1209                        osd->od_svname,
1210                        attr->la_uid, attr->la_gid, bspace, blksize);
1211         }
1212         /* to preserve locking order - qsd_transfer() may need to flush
1213          * currently running transaction when we're out of quota.
1214          */
1215         up_read(&obj->oo_guard);
1216
1217         /* quota enforcement for user */
1218         if (attr && attr->la_valid & LA_UID &&
1219             attr->la_uid != obj->oo_attr.la_uid) {
1220                 rc = qsd_transfer(env, osd_def_qsd(osd),
1221                                   &oh->ot_quota_trans, USRQUOTA,
1222                                   obj->oo_attr.la_uid, attr->la_uid,
1223                                   bspace, &info->oti_qi);
1224                 if (rc)
1225                         GOTO(out, rc);
1226         }
1227
1228         /* quota enforcement for group */
1229         if (attr && attr->la_valid & LA_GID &&
1230             attr->la_gid != obj->oo_attr.la_gid) {
1231                 rc = qsd_transfer(env, osd_def_qsd(osd),
1232                                   &oh->ot_quota_trans, GRPQUOTA,
1233                                   obj->oo_attr.la_gid, attr->la_gid,
1234                                   bspace, &info->oti_qi);
1235                 if (rc)
1236                         GOTO(out, rc);
1237         }
1238 #ifdef ZFS_PROJINHERIT
1239         /* quota enforcement for project */
1240         if (attr && attr->la_valid & LA_PROJID &&
1241             attr->la_projid != obj->oo_attr.la_projid) {
1242                 if (!osd->od_projectused_dn)
1243                         GOTO(out, rc = -EOPNOTSUPP);
1244
1245                 if (!zpl_is_valid_projid(attr->la_projid))
1246                         GOTO(out, rc = -EINVAL);
1247
1248                 rc = qsd_transfer(env, osd_def_qsd(osd),
1249                                   &oh->ot_quota_trans, PRJQUOTA,
1250                                   obj->oo_attr.la_projid,
1251                                   attr->la_projid, bspace,
1252                                   &info->oti_qi);
1253                 if (rc)
1254                         GOTO(out, rc);
1255         }
1256 #endif
1257 out:
1258         RETURN(rc);
1259 out_sem:
1260         up_read(&obj->oo_guard);
1261         RETURN(rc);
1262 }
1263
1264 /*
1265  * Set the attributes of an object
1266  *
1267  * The transaction passed to this routine must have
1268  * dmu_tx_hold_bonus(tx, oid) called and then assigned
1269  * to a transaction group.
1270  */
1271 static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
1272                         const struct lu_attr *la, struct thandle *handle)
1273 {
1274         struct osd_thread_info  *info = osd_oti_get(env);
1275         sa_bulk_attr_t          *bulk = osd_oti_get(env)->oti_attr_bulk;
1276         struct osd_object       *obj = osd_dt_obj(dt);
1277         struct osd_device       *osd = osd_obj2dev(obj);
1278         struct osd_thandle      *oh;
1279         struct osa_attr         *osa = &info->oti_osa;
1280         __u64                    valid = la->la_valid;
1281         int                      cnt;
1282         int                      rc = 0;
1283
1284         ENTRY;
1285
1286         down_read(&obj->oo_guard);
1287         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1288                 GOTO(out, rc = -ENOENT);
1289
1290         LASSERT(handle != NULL);
1291         LASSERT(osd_invariant(obj));
1292         LASSERT(obj->oo_sa_hdl);
1293
1294         oh = container_of(handle, struct osd_thandle, ot_super);
1295         /* Assert that the transaction has been assigned to a
1296            transaction group. */
1297         LASSERT(oh->ot_tx->tx_txg != 0);
1298
1299         if (CFS_FAIL_CHECK(OBD_FAIL_OSD_FID_MAPPING) && !osd->od_is_ost) {
1300                 struct zpl_direntry *zde = &info->oti_zde.lzd_reg;
1301                 char *buf = info->oti_str;
1302                 dnode_t *zdn = NULL;
1303                 uint64_t zapid;
1304
1305                 zapid = osd_get_name_n_idx(env, osd, lu_object_fid(&dt->do_lu),
1306                                            buf, sizeof(info->oti_str), &zdn);
1307                 rc = osd_zap_lookup(osd, zapid, zdn, buf, 8,
1308                                     sizeof(*zde) / 8, zde);
1309                 if (!rc) {
1310                         zde->zde_dnode -= 1;
1311                         rc = -zap_update(osd->od_os, zapid, buf, 8,
1312                                          sizeof(*zde) / 8, zde, oh->ot_tx);
1313                 }
1314                 if (rc > 0)
1315                         rc = 0;
1316                 GOTO(out, rc);
1317         }
1318
1319         /* Only allow set size for regular file */
1320         if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
1321                 valid &= ~(LA_SIZE | LA_BLOCKS);
1322
1323         if (valid & LA_CTIME && la->la_ctime == obj->oo_attr.la_ctime)
1324                 valid &= ~LA_CTIME;
1325
1326         if (valid & LA_MTIME && la->la_mtime == obj->oo_attr.la_mtime)
1327                 valid &= ~LA_MTIME;
1328
1329         if (valid & LA_ATIME && la->la_atime == obj->oo_attr.la_atime)
1330                 valid &= ~LA_ATIME;
1331
1332         if (valid == 0)
1333                 GOTO(out, rc = 0);
1334
1335         if (valid & LA_FLAGS) {
1336                 struct lustre_mdt_attrs *lma;
1337                 struct lu_buf buf;
1338                 int size = 0;
1339
1340                 if (la->la_flags & LUSTRE_LMA_FL_MASKS) {
1341                         LASSERT(!obj->oo_pfid_in_lma);
1342                         BUILD_BUG_ON(sizeof(info->oti_buf) < sizeof(*lma));
1343                         lma = (struct lustre_mdt_attrs *)&info->oti_buf;
1344                         buf.lb_buf = lma;
1345                         buf.lb_len = sizeof(info->oti_buf);
1346
1347                         /* Please do NOT call osd_xattr_get() directly, that
1348                          * will cause recursive down_read() on oo_guard. */
1349                         rc = osd_xattr_get_internal(env, obj, &buf,
1350                                                     XATTR_NAME_LMA, &size);
1351                         if (!rc && unlikely(size < sizeof(*lma))) {
1352                                 rc = -EINVAL;
1353                         } else if (!rc) {
1354                                 lma->lma_incompat =
1355                                         le32_to_cpu(lma->lma_incompat);
1356
1357                                 if ((lma->lma_incompat &
1358                                      lustre_to_lma_flags(la->la_flags)) ==
1359                                     lustre_to_lma_flags(la->la_flags))
1360                                         /* save a useless xattr set if lma
1361                                          * incompat already has the flags
1362                                          */
1363                                         GOTO(lock, rc = 0);
1364
1365                                 lma->lma_incompat |=
1366                                         lustre_to_lma_flags(la->la_flags);
1367                                 lma->lma_incompat =
1368                                         cpu_to_le32(lma->lma_incompat);
1369                                 buf.lb_buf = lma;
1370                                 buf.lb_len = sizeof(*lma);
1371                                 rc = osd_xattr_set_internal(env, obj, &buf,
1372                                                             XATTR_NAME_LMA,
1373                                                             LU_XATTR_REPLACE,
1374                                                             oh);
1375                         }
1376                         if (rc < 0) {
1377                                 CWARN("%s: failed to set LMA flags: rc = %d\n",
1378                                        osd->od_svname, rc);
1379                                 GOTO(out, rc);
1380                         } else {
1381                                 obj->oo_lma_flags =
1382                                         la->la_flags & LUSTRE_LMA_FL_MASKS;
1383                         }
1384                 }
1385         }
1386
1387 lock:
1388         write_lock(&obj->oo_attr_lock);
1389         cnt = 0;
1390
1391         if (valid & LA_PROJID) {
1392 #ifdef ZFS_PROJINHERIT
1393                 if (osd->od_projectused_dn) {
1394                         if (obj->oo_with_projid) {
1395                                 osa->projid  = la->la_projid;
1396                                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd),
1397                                                  NULL, &osa->projid, 8);
1398                         } else {
1399                                 rc = osd_add_projid(env, obj, oh,
1400                                                     la->la_projid);
1401                                 if (unlikely(rc == -EEXIST)) {
1402                                         rc = 0;
1403                                 } else if (rc != 0) {
1404                                         write_unlock(&obj->oo_attr_lock);
1405                                         GOTO(out, rc);
1406                                 }
1407                                 obj->oo_with_projid = 1;
1408                         }
1409                         obj->oo_attr.la_projid = la->la_projid;
1410                 } else
1411 #endif
1412                         valid &= ~LA_PROJID;
1413         }
1414
1415         if (valid & LA_ATIME) {
1416                 osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
1417                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL,
1418                                  osa->atime, 16);
1419         }
1420         if (valid & LA_MTIME) {
1421                 osa->mtime[0] = obj->oo_attr.la_mtime = la->la_mtime;
1422                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL,
1423                                  osa->mtime, 16);
1424         }
1425         if (valid & LA_CTIME) {
1426                 osa->ctime[0] = obj->oo_attr.la_ctime = la->la_ctime;
1427                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL,
1428                                  osa->ctime, 16);
1429         }
1430         if (valid & LA_MODE) {
1431                 /* mode is stored along with type, so read it first */
1432                 obj->oo_attr.la_mode = (obj->oo_attr.la_mode & S_IFMT) |
1433                         (la->la_mode & ~S_IFMT);
1434                 osa->mode = obj->oo_attr.la_mode;
1435                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL,
1436                                  &osa->mode, 8);
1437         }
1438         if (valid & LA_SIZE) {
1439                 osa->size = obj->oo_attr.la_size = la->la_size;
1440                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL,
1441                                  &osa->size, 8);
1442         }
1443         if (valid & LA_NLINK) {
1444                 osa->nlink = obj->oo_attr.la_nlink = la->la_nlink;
1445                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL,
1446                                  &osa->nlink, 8);
1447         }
1448         if (valid & LA_RDEV) {
1449                 osa->rdev = obj->oo_attr.la_rdev = la->la_rdev;
1450                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL,
1451                                  &osa->rdev, 8);
1452         }
1453         if (valid & LA_FLAGS) {
1454                 osa->flags = attrs_fs2zfs(la->la_flags);
1455                 /* many flags are not supported by zfs, so ensure a good cached
1456                  * copy */
1457                 obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags);
1458 #ifdef ZFS_PROJINHERIT
1459                 if (obj->oo_with_projid && osd->od_projectused_dn)
1460                         osa->flags |= ZFS_PROJID;
1461 #endif
1462                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL,
1463                                  &osa->flags, 8);
1464         }
1465         if (valid & LA_UID) {
1466                 osa->uid = obj->oo_attr.la_uid = la->la_uid;
1467                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL,
1468                                  &osa->uid, 8);
1469         }
1470         if (valid & LA_GID) {
1471                 osa->gid = obj->oo_attr.la_gid = la->la_gid;
1472                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL,
1473                                  &osa->gid, 8);
1474         }
1475         obj->oo_attr.la_valid |= valid;
1476         write_unlock(&obj->oo_attr_lock);
1477
1478         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
1479         rc = osd_object_sa_bulk_update(obj, bulk, cnt, oh);
1480
1481 out:
1482         up_read(&obj->oo_guard);
1483         RETURN(rc);
1484 }
1485
1486 /*
1487  * Object creation.
1488  *
1489  * XXX temporary solution.
1490  */
1491
1492 static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
1493                         struct dt_object *parent, struct dt_object *child,
1494                         umode_t child_mode)
1495 {
1496         LASSERT(ah);
1497
1498         ah->dah_parent = parent;
1499
1500         if (parent != NULL && !dt_object_remote(parent)) {
1501                 /* will help to find FID->ino at dt_insert("..") */
1502                 struct osd_object *pobj = osd_dt_obj(parent);
1503
1504                 osd_idc_find_and_init(env, osd_obj2dev(pobj), pobj);
1505         }
1506 }
1507
1508 static int osd_declare_create(const struct lu_env *env, struct dt_object *dt,
1509                               struct lu_attr *attr,
1510                               struct dt_allocation_hint *hint,
1511                               struct dt_object_format *dof,
1512                               struct thandle *handle)
1513 {
1514         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
1515         struct osd_object       *obj = osd_dt_obj(dt);
1516         struct osd_device       *osd = osd_obj2dev(obj);
1517         struct osd_thandle      *oh;
1518         uint64_t                 zapid;
1519         dnode_t                 *dn;
1520         int                      rc, dnode_size;
1521         ENTRY;
1522
1523         LASSERT(dof);
1524
1525         switch (dof->dof_type) {
1526                 case DFT_REGULAR:
1527                 case DFT_SYM:
1528                 case DFT_NODE:
1529                         if (obj->oo_dt.do_body_ops == NULL)
1530                                 obj->oo_dt.do_body_ops = &osd_body_ops;
1531                         break;
1532                 default:
1533                         break;
1534         }
1535
1536         LASSERT(handle != NULL);
1537         oh = container_of(handle, struct osd_thandle, ot_super);
1538         LASSERT(oh->ot_tx != NULL);
1539
1540         /* this is the minimum set of EAs on every Lustre object */
1541         obj->oo_ea_in_bonus = OSD_BASE_EA_IN_BONUS;
1542         /* reserve 32 bytes for extra stuff like ACLs */
1543         dnode_size = size_roundup_power2(obj->oo_ea_in_bonus + 32);
1544
1545         switch (dof->dof_type) {
1546                 case DFT_DIR:
1547                         dt->do_index_ops = &osd_dir_ops;
1548                         fallthrough;
1549                 case DFT_INDEX:
1550                         /* for zap create */
1551                         dmu_tx_hold_zap(oh->ot_tx, DMU_NEW_OBJECT, FALSE, NULL);
1552                         dmu_tx_hold_sa_create(oh->ot_tx, dnode_size);
1553                         break;
1554                 case DFT_REGULAR:
1555                 case DFT_SYM:
1556                 case DFT_NODE:
1557                         /* first, we'll create new object */
1558                         dmu_tx_hold_sa_create(oh->ot_tx, dnode_size);
1559                         break;
1560
1561                 default:
1562                         LBUG();
1563                         break;
1564         }
1565
1566         /* and we'll add it to some mapping */
1567         zapid = osd_get_name_n_idx(env, osd, fid, NULL, 0, &dn);
1568         osd_tx_hold_zap(oh->ot_tx, zapid, dn, TRUE, NULL);
1569
1570         /* will help to find FID->ino mapping at dt_insert() */
1571         osd_idc_find_and_init(env, osd, obj);
1572
1573         rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid,
1574                                attr->la_projid, 1, oh, NULL, OSD_QID_INODE);
1575
1576         RETURN(rc);
1577 }
1578
1579 int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
1580                     struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
1581                     struct lu_attr *la, uint64_t parent,
1582                     nvlist_t *xattr)
1583 {
1584         sa_bulk_attr_t *bulk = osd_oti_get(env)->oti_attr_bulk;
1585         struct osa_attr *osa = &osd_oti_get(env)->oti_osa;
1586         uint64_t gen;
1587         inode_timespec_t now;
1588         int cnt;
1589         int rc;
1590         char *dxattr = NULL;
1591         size_t sa_size;
1592
1593
1594         LASSERT(sa_hdl);
1595
1596         gen = dmu_tx_get_txg(tx);
1597         gethrestime(&now);
1598         ZFS_TIME_ENCODE(&now, osa->btime);
1599
1600         osa->atime[0] = la->la_atime;
1601         osa->ctime[0] = la->la_ctime;
1602         osa->mtime[0] = la->la_mtime;
1603         osa->mode = la->la_mode;
1604         osa->uid = la->la_uid;
1605         osa->gid = la->la_gid;
1606         osa->rdev = la->la_rdev;
1607         osa->nlink = la->la_nlink;
1608         if (la->la_valid & LA_FLAGS)
1609                 osa->flags = attrs_fs2zfs(la->la_flags);
1610         else
1611                 osa->flags = 0;
1612         osa->size  = la->la_size;
1613 #ifdef ZFS_PROJINHERIT
1614         if (osd->od_projectused_dn) {
1615                 if (la->la_valid & LA_PROJID)
1616                         osa->projid = la->la_projid;
1617                 else
1618                         osa->projid = ZFS_DEFAULT_PROJID;
1619                 osa->flags |= ZFS_PROJID;
1620                 if (obj)
1621                         obj->oo_with_projid = 1;
1622         } else {
1623                 osa->flags &= ~ZFS_PROJID;
1624         }
1625 #endif
1626
1627         /*
1628          * we need to create all SA below upon object create.
1629          *
1630          * XXX The attribute order matters since the accounting callback relies
1631          * on static offsets (i.e. SA_*_OFFSET, see zfs_space_delta_cb()) to
1632          * look up the UID/GID/PROJID attributes. Moreover, the callback does
1633          * not seem to support the spill block.
1634          * We define attributes in the same order as SA_*_OFFSET in order to
1635          * work around the problem. See ORI-610.
1636          */
1637         cnt = 0;
1638         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL, &osa->mode, 8);
1639         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL, &osa->size, 8);
1640         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GEN(osd), NULL, &gen, 8);
1641         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL, &osa->uid, 8);
1642         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL, &osa->gid, 8);
1643         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PARENT(osd), NULL, &parent, 8);
1644         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL, &osa->flags, 8);
1645         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL, osa->atime, 16);
1646         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL, osa->mtime, 16);
1647         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
1648         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, osa->btime, 16);
1649         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
1650 #ifdef ZFS_PROJINHERIT
1651         if (osd->od_projectused_dn)
1652                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
1653                                  &osa->projid, 8);
1654 #endif
1655         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
1656         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
1657
1658         if (xattr) {
1659                 rc = -nvlist_size(xattr, &sa_size, NV_ENCODE_XDR);
1660                 LASSERT(rc == 0);
1661
1662                 dxattr = osd_zio_buf_alloc(sa_size);
1663                 LASSERT(dxattr);
1664
1665                 rc = -nvlist_pack(xattr, &dxattr, &sa_size,
1666                                 NV_ENCODE_XDR, KM_SLEEP);
1667                 LASSERT(rc == 0);
1668
1669                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_DXATTR(osd),
1670                                 NULL, dxattr, sa_size);
1671         }
1672
1673         rc = -sa_replace_all_by_template(sa_hdl, bulk, cnt, tx);
1674         if (dxattr)
1675                 osd_zio_buf_free(dxattr, sa_size);
1676
1677         return rc;
1678 }
1679
1680 int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
1681                        uint64_t oid, dnode_t **dnp)
1682 {
1683         dmu_tx_hold_t *txh;
1684         int rc = 0;
1685
1686         /* take dnode_t from tx to save on dnode#->dnode_t lookup */
1687         for (txh = list_tail(&tx->tx_holds); txh;
1688              txh = list_prev(&tx->tx_holds, txh)) {
1689                 dnode_t *dn = txh->txh_dnode;
1690                 dmu_buf_impl_t *db;
1691
1692                 if (dn == NULL)
1693                         continue;
1694                 if (dn->dn_object != oid)
1695                         continue;
1696                 db = dn->dn_bonus;
1697                 if (db == NULL) {
1698                         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1699                         if (dn->dn_bonus == NULL)
1700                                 dbuf_create_bonus(dn);
1701                         rw_exit(&dn->dn_struct_rwlock);
1702                 }
1703                 db = dn->dn_bonus;
1704                 LASSERT(db);
1705                 LASSERT(dn->dn_handle);
1706                 DB_DNODE_ENTER(db);
1707                 if (zfs_refcount_add(&db->db_holds, osd_obj_tag) == 1) {
1708                         zfs_refcount_add(&dn->dn_holds, osd_obj_tag);
1709                         atomic_inc_32(&dn->dn_dbufs_count);
1710                 }
1711                 *dnp = dn;
1712                 DB_DNODE_EXIT(db);
1713                 dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH);
1714                 break;
1715         }
1716
1717         if (unlikely(*dnp == NULL))
1718                 rc = __osd_obj2dnode(tx->tx_objset, oid, dnp);
1719
1720         return rc;
1721 }
1722
1723 #ifdef HAVE_DMU_OBJECT_ALLOC_DNSIZE
1724 int osd_find_dnsize(struct osd_device *osd, int ea_in_bonus)
1725 {
1726         int dnsize;
1727
1728         if (osd->od_dnsize == ZFS_DNSIZE_AUTO) {
1729                 dnsize = DNODE_MIN_SIZE;
1730                 do {
1731                         if (DN_BONUS_SIZE(dnsize) >= ea_in_bonus + 32)
1732                                 break;
1733                         dnsize <<= 1;
1734                 } while (dnsize < DNODE_MAX_SIZE);
1735                 if (dnsize > DNODE_MAX_SIZE)
1736                         dnsize = DNODE_MAX_SIZE;
1737         } else if (osd->od_dnsize == ZFS_DNSIZE_1K) {
1738                 dnsize = 1024;
1739         } else if (osd->od_dnsize == ZFS_DNSIZE_2K) {
1740                 dnsize = 2048;
1741         } else if (osd->od_dnsize == ZFS_DNSIZE_4K) {
1742                 dnsize = 4096;
1743         } else if (osd->od_dnsize == ZFS_DNSIZE_8K) {
1744                 dnsize = 8192;
1745         } else if (osd->od_dnsize == ZFS_DNSIZE_16K) {
1746                 dnsize = 16384;
1747         } else {
1748                 dnsize = DNODE_MIN_SIZE;
1749         }
1750         return dnsize;
1751 }
1752 #endif
1753
1754 /*
1755  * The transaction passed to this routine must have
1756  * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned
1757  * to a transaction group.
1758  */
1759 int __osd_object_create(const struct lu_env *env, struct osd_device *osd,
1760                         struct osd_object *obj, const struct lu_fid *fid,
1761                         dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la)
1762 {
1763         dmu_object_type_t type = DMU_OT_PLAIN_FILE_CONTENTS;
1764         uint64_t oid;
1765         int size;
1766
1767         /* Use DMU_OTN_UINT8_METADATA for local objects so their data blocks
1768          * would get an additional ditto copy */
1769         if (unlikely(S_ISREG(la->la_mode) &&
1770                      fid_seq_is_local_file(fid_seq(fid))))
1771                 type = DMU_OTN_UINT8_METADATA;
1772
1773         /* Create a new DMU object using the default dnode size. */
1774         if (obj)
1775                 size = obj->oo_ea_in_bonus;
1776         else
1777                 size = OSD_BASE_EA_IN_BONUS;
1778         oid = osd_dmu_object_alloc(osd->od_os, type, 0,
1779                                    osd_find_dnsize(osd, size), tx);
1780
1781         LASSERT(la->la_valid & LA_MODE);
1782         la->la_size = 0;
1783         la->la_nlink = 1;
1784
1785         return osd_find_new_dnode(env, tx, oid, dnp);
1786 }
1787
1788 /*
1789  * The transaction passed to this routine must have
1790  * dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, ...) called and then assigned
1791  * to a transaction group.
1792  *
1793  * Using ZAP_FLAG_HASH64 will force the ZAP to always be a FAT ZAP.
1794  * This is fine for directories today, because storing the FID in the dirent
1795  * will also require a FAT ZAP.  If there is a new type of micro ZAP created
1796  * then we might need to re-evaluate the use of this flag and instead do
1797  * a conversion from the different internal ZAP hash formats being used. */
1798 int __osd_zap_create(const struct lu_env *env, struct osd_device *osd,
1799                      dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la,
1800                      unsigned dnsize, zap_flags_t flags)
1801 {
1802         uint64_t oid;
1803
1804         /* Assert that the transaction has been assigned to a
1805            transaction group. */
1806         LASSERT(tx->tx_txg != 0);
1807         *dnp = NULL;
1808
1809         oid = osd_zap_create_flags(osd->od_os, 0, flags | ZAP_FLAG_HASH64,
1810                                    DMU_OT_DIRECTORY_CONTENTS,
1811                                    14, /* == ZFS fzap_default_blockshift */
1812                                    DN_MAX_INDBLKSHIFT, /* indirect blockshift */
1813                                    dnsize, tx);
1814
1815         la->la_size = 2;
1816         la->la_nlink = 1;
1817
1818         return osd_find_new_dnode(env, tx, oid, dnp);
1819 }
1820
1821 static dnode_t *osd_mkidx(const struct lu_env *env, struct osd_object *obj,
1822                           struct lu_attr *la, struct osd_thandle *oh)
1823 {
1824         struct osd_device *osd = osd_obj2dev(obj);
1825         dnode_t *dn;
1826         int rc;
1827
1828         /* Index file should be created as regular file in order not to confuse
1829          * ZPL which could interpret them as directory.
1830          * We set ZAP_FLAG_UINT64_KEY to let ZFS know than we are going to use
1831          * binary keys */
1832         LASSERT(S_ISREG(la->la_mode));
1833         rc = __osd_zap_create(env, osd, &dn, oh->ot_tx, la,
1834                 osd_find_dnsize(osd, obj->oo_ea_in_bonus), ZAP_FLAG_UINT64_KEY);
1835         if (rc)
1836                 return ERR_PTR(rc);
1837         return dn;
1838 }
1839
1840 static dnode_t *osd_mkdir(const struct lu_env *env, struct osd_object *obj,
1841                           struct lu_attr *la, struct osd_thandle *oh)
1842 {
1843         struct osd_device *osd = osd_obj2dev(obj);
1844         dnode_t *dn;
1845         int rc;
1846
1847         LASSERT(S_ISDIR(la->la_mode));
1848         rc = __osd_zap_create(env, osd, &dn, oh->ot_tx, la,
1849                               osd_find_dnsize(osd, obj->oo_ea_in_bonus), 0);
1850         if (rc)
1851                 return ERR_PTR(rc);
1852         return dn;
1853 }
1854
1855 static dnode_t *osd_mkreg(const struct lu_env *env, struct osd_object *obj,
1856                           struct lu_attr *la, struct osd_thandle *oh)
1857 {
1858         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
1859         struct osd_device *osd = osd_obj2dev(obj);
1860         dnode_t *dn;
1861         int rc;
1862
1863         LASSERT(S_ISREG(la->la_mode));
1864         rc = __osd_object_create(env, osd, obj, fid, &dn, oh->ot_tx, la);
1865         if (rc)
1866                 return ERR_PTR(rc);
1867
1868         if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid))) {
1869                 /* The minimum block size must be at least page size otherwise
1870                  * it will break the assumption in tgt_thread_big_cache where
1871                  * the array size is PTLRPC_MAX_BRW_PAGES. It will also affect
1872                  * RDMA due to subpage transfer size */
1873                 rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object,
1874                                                PAGE_SIZE, 0, oh->ot_tx);
1875                 if (unlikely(rc)) {
1876                         CERROR("%s: can't change blocksize: %d\n",
1877                                osd->od_svname, rc);
1878                         return ERR_PTR(rc);
1879                 }
1880         } else if ((fid_is_llog(fid))) {
1881                 rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object,
1882                                                LLOG_MIN_CHUNK_SIZE, 0, oh->ot_tx);
1883                 if (unlikely(rc)) {
1884                         CERROR("%s: can't change blocksize: %d\n",
1885                                osd->od_svname, rc);
1886                         return ERR_PTR(rc);
1887                 }
1888         }
1889
1890         return dn;
1891 }
1892
1893 static dnode_t *osd_mksym(const struct lu_env *env, struct osd_object *obj,
1894                           struct lu_attr *la, struct osd_thandle *oh)
1895 {
1896         dnode_t *dn;
1897         int rc;
1898
1899         LASSERT(S_ISLNK(la->la_mode));
1900         rc = __osd_object_create(env, osd_obj2dev(obj), obj,
1901                                  lu_object_fid(&obj->oo_dt.do_lu),
1902                                  &dn, oh->ot_tx, la);
1903         if (rc)
1904                 return ERR_PTR(rc);
1905         return dn;
1906 }
1907
1908 static dnode_t *osd_mknod(const struct lu_env *env, struct osd_object *obj,
1909                           struct lu_attr *la, struct osd_thandle *oh)
1910 {
1911         dnode_t *dn;
1912         int rc;
1913
1914         if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode))
1915                 la->la_valid |= LA_RDEV;
1916
1917         rc = __osd_object_create(env, osd_obj2dev(obj), obj,
1918                                  lu_object_fid(&obj->oo_dt.do_lu),
1919                                  &dn, oh->ot_tx, la);
1920         if (rc)
1921                 return ERR_PTR(rc);
1922         return dn;
1923 }
1924
1925 typedef dnode_t *(*osd_obj_type_f)(const struct lu_env *env,
1926                                    struct osd_object *obj,
1927                                    struct lu_attr *la,
1928                                    struct osd_thandle *oh);
1929
1930 static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
1931 {
1932         osd_obj_type_f result;
1933
1934         switch (type) {
1935         case DFT_DIR:
1936                 result = osd_mkdir;
1937                 break;
1938         case DFT_INDEX:
1939                 result = osd_mkidx;
1940                 break;
1941         case DFT_REGULAR:
1942                 result = osd_mkreg;
1943                 break;
1944         case DFT_SYM:
1945                 result = osd_mksym;
1946                 break;
1947         case DFT_NODE:
1948                 result = osd_mknod;
1949                 break;
1950         default:
1951                 LBUG();
1952                 break;
1953         }
1954         return result;
1955 }
1956
1957 /*
1958  * Concurrency: @dt is write locked.
1959  */
1960 static int osd_create(const struct lu_env *env, struct dt_object *dt,
1961                       struct lu_attr *attr, struct dt_allocation_hint *hint,
1962                       struct dt_object_format *dof, struct thandle *th)
1963 {
1964         struct osd_thread_info  *info = osd_oti_get(env);
1965         struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
1966         struct zpl_direntry     *zde = &info->oti_zde.lzd_reg;
1967         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
1968         struct osd_object       *obj = osd_dt_obj(dt);
1969         struct osd_device       *osd = osd_obj2dev(obj);
1970         char                    *buf = info->oti_str;
1971         struct osd_thandle      *oh;
1972         dnode_t *dn = NULL, *zdn = NULL;
1973         uint64_t                 zapid, parent = 0;
1974         int                      rc;
1975         __u32 compat = 0;
1976
1977         ENTRY;
1978
1979         LASSERT(!fid_is_acct(fid));
1980
1981         /* concurrent create declarations should not see
1982          * the object inconsistent (db, attr, etc).
1983          * in regular cases acquisition should be cheap */
1984         down_write(&obj->oo_guard);
1985
1986         if (unlikely(dt_object_exists(dt)))
1987                 GOTO(out, rc = -EEXIST);
1988
1989         LASSERT(osd_invariant(obj));
1990         LASSERT(dof != NULL);
1991
1992         LASSERT(th != NULL);
1993         oh = container_of(th, struct osd_thandle, ot_super);
1994
1995         LASSERT(obj->oo_dn == NULL);
1996
1997         /* to follow ZFS on-disk format we need
1998          * to initialize parent dnode properly */
1999         if (hint != NULL && hint->dah_parent != NULL &&
2000             !dt_object_remote(hint->dah_parent))
2001                 parent = osd_dt_obj(hint->dah_parent)->oo_dn->dn_object;
2002
2003         /* we may fix some attributes, better do not change the source */
2004         obj->oo_attr = *attr;
2005         obj->oo_attr.la_size = 0;
2006         obj->oo_attr.la_nlink = 0;
2007         obj->oo_attr.la_valid |= LA_SIZE | LA_NLINK | LA_TYPE;
2008
2009 #ifdef ZFS_PROJINHERIT
2010         if (osd->od_projectused_dn) {
2011                 if (!(obj->oo_attr.la_valid & LA_PROJID))
2012                         obj->oo_attr.la_projid = ZFS_DEFAULT_PROJID;
2013                 obj->oo_with_projid = 1;
2014         }
2015 #endif
2016
2017         dn = osd_create_type_f(dof->dof_type)(env, obj, &obj->oo_attr, oh);
2018         if (IS_ERR(dn)) {
2019                 rc = PTR_ERR(dn);
2020                 dn = NULL;
2021                 GOTO(out, rc);
2022         }
2023
2024         zde->zde_pad = 0;
2025         zde->zde_dnode = dn->dn_object;
2026         zde->zde_type = S_DT(attr->la_mode & S_IFMT);
2027
2028         zapid = osd_get_name_n_idx(env, osd, fid, buf,
2029                                    sizeof(info->oti_str), &zdn);
2030         if (CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY) ||
2031             (osd->od_is_ost && CFS_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_NO_ENTRY)))
2032                 goto skip_add;
2033
2034         if (osd->od_is_ost && CFS_FAIL_CHECK(OBD_FAIL_OSD_COMPAT_INVALID_ENTRY))
2035                 zde->zde_dnode++;
2036
2037         rc = osd_zap_add(osd, zapid, zdn, buf, 8, 1, zde, oh->ot_tx);
2038         if (rc)
2039                 GOTO(out, rc);
2040
2041 skip_add:
2042         obj->oo_dn = dn;
2043         /* Now add in all of the "SA" attributes */
2044         rc = osd_sa_handle_get(obj);
2045         if (rc)
2046                 GOTO(out, rc);
2047
2048         rc = -nvlist_alloc(&obj->oo_sa_xattr, NV_UNIQUE_NAME, KM_SLEEP);
2049         if (rc)
2050                 GOTO(out, rc);
2051
2052         /* initialize LMA */
2053         if (fid_is_idif(fid) || (fid_is_norm(fid) && osd->od_is_ost))
2054                 compat |= LMAC_FID_ON_OST;
2055         lustre_lma_init(lma, fid, compat, 0);
2056         lustre_lma_swab(lma);
2057         rc = -nvlist_add_byte_array(obj->oo_sa_xattr, XATTR_NAME_LMA,
2058                                     (uchar_t *)lma, sizeof(*lma));
2059         if (rc)
2060                 GOTO(out, rc);
2061
2062         /* configure new osd object */
2063         obj->oo_parent = parent != 0 ? parent : zapid;
2064         obj->oo_late_attr_set = 1;
2065         rc = __osd_sa_xattr_schedule_update(env, obj, oh);
2066         if (rc)
2067                 GOTO(out, rc);
2068
2069         /* XXX: oo_lma_flags */
2070         obj->oo_dt.do_lu.lo_header->loh_attr |= obj->oo_attr.la_mode & S_IFMT;
2071         if (likely(!fid_is_acct(lu_object_fid(&obj->oo_dt.do_lu))))
2072                 /* no body operations for accounting objects */
2073                 obj->oo_dt.do_body_ops = &osd_body_ops;
2074
2075         osd_idc_find_and_init(env, osd, obj);
2076
2077 out:
2078         if (unlikely(rc && dn)) {
2079                 dmu_object_free(osd->od_os, dn->dn_object, oh->ot_tx);
2080                 osd_dnode_rele(dn);
2081                 obj->oo_dn = NULL;
2082         } else if (!rc) {
2083                 obj->oo_dt.do_lu.lo_header->loh_attr |= LOHA_EXISTS;
2084         }
2085         up_write(&obj->oo_guard);
2086         RETURN(rc);
2087 }
2088
2089 static int osd_declare_ref_add(const struct lu_env *env, struct dt_object *dt,
2090                                struct thandle *th)
2091 {
2092         osd_idc_find_and_init(env, osd_dev(dt->do_lu.lo_dev), osd_dt_obj(dt));
2093         return osd_declare_attr_set(env, dt, NULL, th);
2094 }
2095
2096 /*
2097  * Concurrency: @dt is write locked.
2098  */
2099 static int osd_ref_add(const struct lu_env *env, struct dt_object *dt,
2100                        struct thandle *handle)
2101 {
2102         struct osd_object       *obj = osd_dt_obj(dt);
2103         struct osd_thandle      *oh;
2104         struct osd_device       *osd = osd_obj2dev(obj);
2105         uint64_t                 nlink;
2106         int rc;
2107
2108         ENTRY;
2109
2110         down_read(&obj->oo_guard);
2111         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
2112                 GOTO(out, rc = -ENOENT);
2113
2114         LASSERT(osd_invariant(obj));
2115         LASSERT(obj->oo_sa_hdl != NULL);
2116
2117         oh = container_of(handle, struct osd_thandle, ot_super);
2118
2119         write_lock(&obj->oo_attr_lock);
2120         nlink = ++obj->oo_attr.la_nlink;
2121         write_unlock(&obj->oo_attr_lock);
2122
2123         rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh);
2124
2125 out:
2126         up_read(&obj->oo_guard);
2127         RETURN(rc);
2128 }
2129
2130 static int osd_declare_ref_del(const struct lu_env *env, struct dt_object *dt,
2131                                struct thandle *handle)
2132 {
2133         osd_idc_find_and_init(env, osd_dev(dt->do_lu.lo_dev), osd_dt_obj(dt));
2134         return osd_declare_attr_set(env, dt, NULL, handle);
2135 }
2136
2137 /*
2138  * Concurrency: @dt is write locked.
2139  */
2140 static int osd_ref_del(const struct lu_env *env, struct dt_object *dt,
2141                        struct thandle *handle)
2142 {
2143         struct osd_object       *obj = osd_dt_obj(dt);
2144         struct osd_thandle      *oh;
2145         struct osd_device       *osd = osd_obj2dev(obj);
2146         uint64_t                 nlink;
2147         int                      rc;
2148
2149         ENTRY;
2150
2151         down_read(&obj->oo_guard);
2152
2153         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
2154                 GOTO(out, rc = -ENOENT);
2155
2156         LASSERT(osd_invariant(obj));
2157         LASSERT(obj->oo_sa_hdl != NULL);
2158
2159         oh = container_of(handle, struct osd_thandle, ot_super);
2160         LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
2161
2162         write_lock(&obj->oo_attr_lock);
2163         nlink = --obj->oo_attr.la_nlink;
2164         write_unlock(&obj->oo_attr_lock);
2165
2166         rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh);
2167
2168 out:
2169         up_read(&obj->oo_guard);
2170         RETURN(rc);
2171 }
2172
2173 static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
2174                            __u64 start, __u64 end)
2175 {
2176         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
2177         uint64_t txg = 0;
2178         ENTRY;
2179
2180         if (osd->od_dt_dev.dd_rdonly)
2181                 RETURN(0);
2182
2183         txg = osd_db_dirty_txg(osd_dt_obj(dt)->oo_dn->dn_dbuf);
2184         if (txg) {
2185                 /* the object is dirty or being synced */
2186                 if (osd_object_sync_delay_us < 0)
2187                         txg_wait_synced(dmu_objset_pool(osd->od_os), txg);
2188                 else
2189                         udelay(osd_object_sync_delay_us);
2190         }
2191
2192         RETURN(0);
2193 }
2194
/*
 * Invalidate method of the object operations: does nothing and always
 * reports success.
 */
static int osd_invalidate(const struct lu_env *env, struct dt_object *dt)
{
	const int rc = 0;

	(void)env;
	(void)dt;

	return rc;
}
2199
/*
 * Stale check method of the object operations: always answers "not
 * stale" without looking at @dt.
 */
static bool osd_check_stale(struct dt_object *dt)
{
	const bool stale = false;

	(void)dt;

	return stale;
}
2204
2205 static const struct dt_object_operations osd_obj_ops = {
2206         .do_read_lock           = osd_read_lock,
2207         .do_write_lock          = osd_write_lock,
2208         .do_read_unlock         = osd_read_unlock,
2209         .do_write_unlock        = osd_write_unlock,
2210         .do_write_locked        = osd_write_locked,
2211         .do_attr_get            = osd_attr_get,
2212         .do_declare_attr_set    = osd_declare_attr_set,
2213         .do_attr_set            = osd_attr_set,
2214         .do_ah_init             = osd_ah_init,
2215         .do_declare_create      = osd_declare_create,
2216         .do_create              = osd_create,
2217         .do_declare_destroy     = osd_declare_destroy,
2218         .do_destroy             = osd_destroy,
2219         .do_index_try           = osd_index_try,
2220         .do_declare_ref_add     = osd_declare_ref_add,
2221         .do_ref_add             = osd_ref_add,
2222         .do_declare_ref_del     = osd_declare_ref_del,
2223         .do_ref_del             = osd_ref_del,
2224         .do_xattr_get           = osd_xattr_get,
2225         .do_declare_xattr_set   = osd_declare_xattr_set,
2226         .do_xattr_set           = osd_xattr_set,
2227         .do_declare_xattr_del   = osd_declare_xattr_del,
2228         .do_xattr_del           = osd_xattr_del,
2229         .do_xattr_list          = osd_xattr_list,
2230         .do_object_sync         = osd_object_sync,
2231         .do_invalidate          = osd_invalidate,
2232         .do_check_stale         = osd_check_stale,
2233 };
2234
2235 static const struct lu_object_operations osd_lu_obj_ops = {
2236         .loo_object_init        = osd_object_init,
2237         .loo_object_delete      = osd_object_delete,
2238         .loo_object_release     = osd_object_release,
2239         .loo_object_free        = osd_object_free,
2240         .loo_object_print       = osd_object_print,
2241         .loo_object_invariant   = osd_object_invariant,
2242 };
2243
2244 static int osd_otable_it_attr_get(const struct lu_env *env,
2245                                 struct dt_object *dt,
2246                                 struct lu_attr *attr)
2247 {
2248         attr->la_valid = 0;
2249         return 0;
2250 }
2251
2252 static const struct dt_object_operations osd_obj_otable_it_ops = {
2253         .do_attr_get            = osd_otable_it_attr_get,
2254         .do_index_try           = osd_index_try,
2255 };
2256
2257 module_param(osd_object_sync_delay_us, int, 0644);
2258 MODULE_PARM_DESC(osd_object_sync_delay_us,
2259                  "If zero or larger delay N usec instead of doing object sync");