lustre/osd-zfs/osd_object.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2012, 2016, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/osd-zfs/osd_object.c
  33  *
  34  * Author: Alex Zhuravlev <bzzz@whamcloud.com>
  35  * Author: Mike Pershin <tappro@whamcloud.com>
  36  * Author: Johann Lombardi <johann@whamcloud.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_OSD
  40
  41 #include <lustre_ver.h>
  42 #include <libcfs/libcfs.h>
  43 #include <obd_support.h>
  44 #include <lustre_net.h>
  45 #include <obd.h>
  46 #include <obd_class.h>
  47 #include <lustre_disk.h>
  48 #include <lustre_fid.h>
  49
  50 #include "osd_internal.h"
  51
  52 #include <sys/dnode.h>
  53 #include <sys/dbuf.h>
  54 #include <sys/spa.h>
  55 #include <sys/stat.h>
  56 #include <sys/zap.h>
  57 #include <sys/spa_impl.h>
  58 #include <sys/zfs_znode.h>
  59 #include <sys/dmu_tx.h>
  60 #include <sys/dmu_objset.h>
  61 #include <sys/dsl_prop.h>
  62 #include <sys/sa_impl.h>
  63 #include <sys/txg.h>
  64
  65 char *osd_obj_tag = "osd_object";
  66 static int osd_object_sync_delay_us = -1;
  67
  68 static struct dt_object_operations osd_obj_ops;
  69 static struct lu_object_operations osd_lu_obj_ops;
  70 extern struct dt_body_operations osd_body_ops;
  71 static struct dt_object_operations osd_obj_otable_it_ops;
  72
  73 extern struct kmem_cache *osd_object_kmem;
  74
  75 static void
  76 osd_object_sa_fini(struct osd_object *obj)
  77 {
  78         if (obj->oo_sa_hdl) {
  79                 sa_handle_destroy(obj->oo_sa_hdl);
  80                 obj->oo_sa_hdl = NULL;
  81         }
  82 }
  83
  84 static int
  85 osd_object_sa_init(struct osd_object *obj, struct osd_device *o)
  86 {
  87         int rc;
  88
  89         LASSERT(obj->oo_sa_hdl == NULL);
  90         LASSERT(obj->oo_dn != NULL);
  91
  92         rc = osd_sa_handle_get(obj);
  93         if (rc)
  94                 return rc;
  95
  96         /* Cache the xattr object id, valid for the life of the object */
  97         rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_XATTR(o), &obj->oo_xattr, 8);
  98         if (rc == -ENOENT) {
  99                 obj->oo_xattr = ZFS_NO_OBJECT;
 100                 rc = 0;
 101         } else if (rc) {
 102                 osd_object_sa_fini(obj);
 103         }
 104
 105         return rc;
 106 }
 107
 108 /*
 109  * Add object to list of dirty objects in tx handle.
 110  */
 111 void osd_object_sa_dirty_add(struct osd_object *obj, struct osd_thandle *oh)
 112 {
 113         if (!list_empty(&obj->oo_sa_linkage))
 114                 return;
 115
 116         write_lock(&obj->oo_attr_lock);
 117         if (likely(list_empty(&obj->oo_sa_linkage)))
 118                 list_add(&obj->oo_sa_linkage, &oh->ot_sa_list);
 119         write_unlock(&obj->oo_attr_lock);
 120 }
 121
 122 /*
 123  * Release spill block dbuf hold for all dirty SAs.
 124  */
 125 void osd_object_sa_dirty_rele(const struct lu_env *env, struct osd_thandle *oh)
 126 {
 127         struct osd_object *obj;
 128
 129         while (!list_empty(&oh->ot_sa_list)) {
 130                 obj = list_entry(oh->ot_sa_list.next,
 131                                  struct osd_object, oo_sa_linkage);
 132                 write_lock(&obj->oo_attr_lock);
 133                 list_del_init(&obj->oo_sa_linkage);
 134                 write_unlock(&obj->oo_attr_lock);
 135                 if (obj->oo_late_xattr) {
 136                         /*
 137                          * take oo_guard to protect oo_sa_xattr buffer
 138                          * from concurrent update by osd_xattr_set()
 139                          */
 140                         LASSERT(oh->ot_assigned != 0);
 141                         down_write(&obj->oo_guard);
 142                         if (obj->oo_late_attr_set)
 143                                 __osd_sa_attr_init(env, obj, oh);
 144                         else if (obj->oo_late_xattr)
 145                                 __osd_sa_xattr_update(env, obj, oh);
 146                         up_write(&obj->oo_guard);
 147                 }
 148                 sa_spill_rele(obj->oo_sa_hdl);
 149         }
 150 }
 151
 152 /*
 153  * Update the SA and add the object to the dirty list.
 154  */
 155 int osd_object_sa_update(struct osd_object *obj, sa_attr_type_t type,
 156                          void *buf, uint32_t buflen, struct osd_thandle *oh)
 157 {
 158         int rc;
 159
 160         LASSERT(obj->oo_sa_hdl != NULL);
 161         LASSERT(oh->ot_tx != NULL);
 162
 163         rc = -sa_update(obj->oo_sa_hdl, type, buf, buflen, oh->ot_tx);
 164         osd_object_sa_dirty_add(obj, oh);
 165
 166         return rc;
 167 }
 168
 169 /*
 170  * Bulk update the SA and add the object to the dirty list.
 171  */
 172 static int
 173 osd_object_sa_bulk_update(struct osd_object *obj, sa_bulk_attr_t *attrs,
 174                           int count, struct osd_thandle *oh)
 175 {
 176         int rc;
 177
 178         LASSERT(obj->oo_sa_hdl != NULL);
 179         LASSERT(oh->ot_tx != NULL);
 180
 181         rc = -sa_bulk_update(obj->oo_sa_hdl, attrs, count, oh->ot_tx);
 182         osd_object_sa_dirty_add(obj, oh);
 183
 184         return rc;
 185 }
 186
 187 /*
 188  * Retrieve the attributes of a DMU object
 189  */
 190 static int __osd_object_attr_get(const struct lu_env *env, struct osd_device *o,
 191                                  struct osd_object *obj, struct lu_attr *la)
 192 {
 193         struct osa_attr *osa = &osd_oti_get(env)->oti_osa;
 194         sa_bulk_attr_t  *bulk = osd_oti_get(env)->oti_attr_bulk;
 195         int              cnt = 0;
 196         int              rc;
 197         ENTRY;
 198
 199         LASSERT(obj->oo_dn != NULL);
 200
 201         la->la_valid |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE | LA_TYPE |
 202                         LA_SIZE | LA_UID | LA_GID | LA_FLAGS | LA_NLINK;
 203
 204         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(o), NULL, osa->atime, 16);
 205         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(o), NULL, osa->mtime, 16);
 206         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(o), NULL, osa->ctime, 16);
 207         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(o), NULL, &osa->mode, 8);
 208         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(o), NULL, &osa->size, 8);
 209         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(o), NULL, &osa->nlink, 8);
 210         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(o), NULL, &osa->uid, 8);
 211         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(o), NULL, &osa->gid, 8);
 212         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(o), NULL, &osa->flags, 8);
 213         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
 214
 215         rc = -sa_bulk_lookup(obj->oo_sa_hdl, bulk, cnt);
 216         if (rc)
 217                 GOTO(out_sa, rc);
 218
 219 #ifdef ZFS_PROJINHERIT
 220         if (o->od_projectused_dn && osa->flags & ZFS_PROJID) {
 221                 rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_PROJID(o),
 222                                 &osa->projid, 8);
 223                 if (rc)
 224                         GOTO(out_sa, rc);
 225
 226                 la->la_projid = osa->projid;
 227                 la->la_valid |= LA_PROJID;
 228                 obj->oo_with_projid = 1;
 229         } else {
 230                 la->la_projid = ZFS_DEFAULT_PROJID;
 231                 la->la_valid &= ~LA_PROJID;
 232         }
 233 #else
 234         la->la_projid = 0;
 235         la->la_valid &= ~LA_PROJID;
 236 #endif
 237
 238         la->la_atime = osa->atime[0];
 239         la->la_mtime = osa->mtime[0];
 240         la->la_ctime = osa->ctime[0];
 241         la->la_mode = osa->mode;
 242         la->la_uid = osa->uid;
 243         la->la_gid = osa->gid;
 244         la->la_nlink = osa->nlink;
 245         la->la_flags = attrs_zfs2fs(osa->flags);
 246         la->la_size = osa->size;
 247
 248         /* Try to get extra flag from LMA. Right now, only LMAI_ORPHAN
 249          * flags is stored in LMA, and it is only for orphan directory */
 250         if (S_ISDIR(la->la_mode) && dt_object_exists(&obj->oo_dt)) {
 251                 struct osd_thread_info *info = osd_oti_get(env);
 252                 struct lustre_mdt_attrs *lma;
 253                 struct lu_buf buf;
 254
 255                 lma = (struct lustre_mdt_attrs *)info->oti_buf;
 256                 buf.lb_buf = lma;
 257                 buf.lb_len = sizeof(info->oti_buf);
 258                 rc = osd_xattr_get(env, &obj->oo_dt, &buf, XATTR_NAME_LMA);
 259                 if (rc > 0) {
 260                         rc = 0;
 261                         lma->lma_incompat = le32_to_cpu(lma->lma_incompat);
 262                         obj->oo_lma_flags =
 263                                 lma_to_lustre_flags(lma->lma_incompat);
 264
 265                 } else if (rc == -ENODATA) {
 266                         rc = 0;
 267                 }
 268         }
 269
 270         if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode)) {
 271                 rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_RDEV(o), &osa->rdev, 8);
 272                 if (rc)
 273                         GOTO(out_sa, rc);
 274                 la->la_rdev = osa->rdev;
 275                 la->la_valid |= LA_RDEV;
 276         }
 277 out_sa:
 278
 279         RETURN(rc);
 280 }
 281
 282 int __osd_obj2dnode(objset_t *os, uint64_t oid, dnode_t **dnp)
 283 {
 284         dmu_buf_t *db;
 285         dmu_buf_impl_t *dbi;
 286         int rc;
 287
 288         rc = -dmu_bonus_hold(os, oid, osd_obj_tag, &db);
 289         if (rc)
 290                 return rc;
 291
 292         dbi = (dmu_buf_impl_t *)db;
 293         DB_DNODE_ENTER(dbi);
 294         *dnp = DB_DNODE(dbi);
 295         LASSERT(*dnp != NULL);
 296
 297         return 0;
 298 }
 299
 300 /*
 301  * Concurrency: no concurrent access is possible that early in object
 302  * life-cycle.
 303  */
 304 struct lu_object *osd_object_alloc(const struct lu_env *env,
 305                                    const struct lu_object_header *hdr,
 306                                    struct lu_device *d)
 307 {
 308         struct osd_object *mo;
 309
 310         OBD_SLAB_ALLOC_PTR_GFP(mo, osd_object_kmem, GFP_NOFS);
 311         if (mo != NULL) {
 312                 struct lu_object *l;
 313
 314                 l = &mo->oo_dt.do_lu;
 315                 dt_object_init(&mo->oo_dt, NULL, d);
 316                 mo->oo_dt.do_ops = &osd_obj_ops;
 317                 l->lo_ops = &osd_lu_obj_ops;
 318                 INIT_LIST_HEAD(&mo->oo_sa_linkage);
 319                 INIT_LIST_HEAD(&mo->oo_unlinked_linkage);
 320                 init_rwsem(&mo->oo_sem);
 321                 init_rwsem(&mo->oo_guard);
 322                 rwlock_init(&mo->oo_attr_lock);
 323                 mo->oo_destroy = OSD_DESTROY_NONE;
 324                 return l;
 325         } else {
 326                 return NULL;
 327         }
 328 }
 329
 330 /*
 331  * Concurrency: shouldn't matter.
 332  */
 333 static int osd_object_init0(const struct lu_env *env, struct osd_object *obj)
 334 {
 335         struct osd_device       *osd = osd_obj2dev(obj);
 336         const struct lu_fid     *fid = lu_object_fid(&obj->oo_dt.do_lu);
 337         int                      rc = 0;
 338         ENTRY;
 339
 340         if (obj->oo_dn == NULL)
 341                 RETURN(0);
 342
 343         /* object exist */
 344
 345         rc = osd_object_sa_init(obj, osd);
 346         if (rc)
 347                 RETURN(rc);
 348
 349         /* cache attrs in object */
 350         rc = __osd_object_attr_get(env, osd, obj, &obj->oo_attr);
 351         if (rc)
 352                 RETURN(rc);
 353
 354         if (likely(!fid_is_acct(fid)))
 355                 /* no body operations for accounting objects */
 356                 obj->oo_dt.do_body_ops = &osd_body_ops;
 357
 358         /*
 359          * initialize object before marking it existing
 360          */
 361         obj->oo_dt.do_lu.lo_header->loh_attr |= obj->oo_attr.la_mode & S_IFMT;
 362
 363         smp_mb();
 364         obj->oo_dt.do_lu.lo_header->loh_attr |= LOHA_EXISTS;
 365
 366         RETURN(0);
 367 }
 368
 369 static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
 370 {
 371         struct osd_thread_info  *info = osd_oti_get(env);
 372         struct lu_buf           buf;
 373         int                     rc;
 374         struct lustre_mdt_attrs *lma;
 375         ENTRY;
 376
 377         CLASSERT(sizeof(info->oti_buf) >= sizeof(*lma));
 378         lma = (struct lustre_mdt_attrs *)info->oti_buf;
 379         buf.lb_buf = lma;
 380         buf.lb_len = sizeof(info->oti_buf);
 381
 382         rc = osd_xattr_get(env, &obj->oo_dt, &buf, XATTR_NAME_LMA);
 383         if (rc > 0) {
 384                 rc = 0;
 385                 lustre_lma_swab(lma);
 386                 if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
 387                              CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
 388                         CWARN("%s: unsupported incompat LMA feature(s) %#x for "
 389                               "fid = "DFID"\n", osd_obj2dev(obj)->od_svname,
 390                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
 391                               PFID(lu_object_fid(&obj->oo_dt.do_lu)));
 392                         rc = -EOPNOTSUPP;
 393                 }
 394         } else if (rc == -ENODATA) {
 395                 /* haven't initialize LMA xattr */
 396                 rc = 0;
 397         }
 398
 399         RETURN(rc);
 400 }
 401
 402 /**
 403  * Helper function to retrieve DMU object id from fid for accounting object
 404  */
 405 static dnode_t *osd_quota_fid2dmu(const struct osd_device *osd,
 406                                   const struct lu_fid *fid)
 407 {
 408         dnode_t *dn = NULL;
 409
 410         LASSERT(fid_is_acct(fid));
 411
 412         switch (fid_oid(fid)) {
 413         case ACCT_USER_OID:
 414                 dn = osd->od_userused_dn;
 415                 break;
 416         case ACCT_GROUP_OID:
 417                 dn = osd->od_groupused_dn;
 418                 break;
 419 #ifdef ZFS_PROJINHERIT
 420         case ACCT_PROJECT_OID:
 421                 dn = osd->od_projectused_dn;
 422                 break;
 423 #endif
 424         default:
 425                 break;
 426         }
 427
 428         return dn;
 429 }
 430
 431 /*
 432  * Concurrency: no concurrent access is possible that early in object
 433  * life-cycle.
 434  */
 435 static int osd_object_init(const struct lu_env *env, struct lu_object *l,
 436                            const struct lu_object_conf *conf)
 437 {
 438         struct osd_object *obj = osd_obj(l);
 439         struct osd_device *osd = osd_obj2dev(obj);
 440         const struct lu_fid *fid = lu_object_fid(l);
 441         uint64_t oid;
 442         int rc = 0;
 443         ENTRY;
 444
 445         LASSERT(osd_invariant(obj));
 446
 447         if (fid_is_otable_it(&l->lo_header->loh_fid)) {
 448                 obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
 449                 l->lo_header->loh_attr |= LOHA_EXISTS;
 450                 RETURN(0);
 451         }
 452
 453         if (conf != NULL && conf->loc_flags & LOC_F_NEW)
 454                 GOTO(out, rc = 0);
 455
 456         if (unlikely(fid_is_acct(fid))) {
 457                 obj->oo_dn = osd_quota_fid2dmu(osd, fid);
 458                 if (obj->oo_dn) {
 459                         obj->oo_dt.do_index_ops = &osd_acct_index_ops;
 460                         l->lo_header->loh_attr |= LOHA_EXISTS;
 461                 }
 462
 463                 GOTO(out, rc = 0);
 464         }
 465
 466         rc = osd_fid_lookup(env, osd, fid, &oid);
 467         if (rc == 0) {
 468                 LASSERT(obj->oo_dn == NULL);
 469                 rc = __osd_obj2dnode(osd->od_os, oid, &obj->oo_dn);
 470                 /* EEXIST will be returned if object is being deleted in ZFS */
 471                 if (rc == -EEXIST) {
 472                         rc = 0;
 473                         GOTO(out, rc);
 474                 }
 475                 if (rc != 0) {
 476                         CERROR("%s: lookup "DFID"/%#llx failed: rc = %d\n",
 477                                osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
 478                         GOTO(out, rc);
 479                 }
 480                 LASSERT(obj->oo_dn);
 481                 rc = osd_object_init0(env, obj);
 482                 if (rc != 0)
 483                         GOTO(out, rc);
 484
 485                 rc = osd_check_lma(env, obj);
 486                 if (rc != 0)
 487                         GOTO(out, rc);
 488         } else if (rc == -ENOENT) {
 489                 rc = 0;
 490         }
 491         LASSERT(osd_invariant(obj));
 492 out:
 493         RETURN(rc);
 494 }
 495
 496 /*
 497  * Concurrency: no concurrent access is possible that late in object
 498  * life-cycle.
 499  */
 500 static void osd_object_free(const struct lu_env *env, struct lu_object *l)
 501 {
 502         struct osd_object *obj = osd_obj(l);
 503
 504         LASSERT(osd_invariant(obj));
 505
 506         dt_object_fini(&obj->oo_dt);
 507         OBD_SLAB_FREE_PTR(obj, osd_object_kmem);
 508 }
 509
 510 static int
 511 osd_object_unlinked_add(struct osd_object *obj, struct osd_thandle *oh)
 512 {
 513         int rc = -EBUSY;
 514
 515         LASSERT(obj->oo_destroy == OSD_DESTROY_ASYNC);
 516
 517         /* the object is supposed to be exclusively locked by
 518          * the caller (osd_destroy()), while the transaction
 519          * (oh) is per-thread and not shared */
 520         if (likely(list_empty(&obj->oo_unlinked_linkage))) {
 521                 list_add(&obj->oo_unlinked_linkage, &oh->ot_unlinked_list);
 522                 rc = 0;
 523         }
 524
 525         return rc;
 526 }
 527
 528 /* Default to max data size covered by a level-1 indirect block */
 529 static unsigned long osd_sync_destroy_max_size =
 530         1UL << (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT + SPA_MAXBLOCKSHIFT);
 531 module_param(osd_sync_destroy_max_size, ulong, 0444);
 532 MODULE_PARM_DESC(osd_sync_destroy_max_size, "Maximum object size to use synchronous destroy.");
 533
 534 static inline void
 535 osd_object_set_destroy_type(struct osd_object *obj)
 536 {
 537         /*
 538          * Lock-less OST_WRITE can race with OST_DESTROY, so set destroy type
 539          * only once and use it consistently thereafter.
 540          */
 541         down_write(&obj->oo_guard);
 542         if (obj->oo_destroy == OSD_DESTROY_NONE) {
 543                 if (obj->oo_attr.la_size <= osd_sync_destroy_max_size)
 544                         obj->oo_destroy = OSD_DESTROY_SYNC;
 545                 else /* Larger objects are destroyed asynchronously */
 546                         obj->oo_destroy = OSD_DESTROY_ASYNC;
 547         }
 548         up_write(&obj->oo_guard);
 549 }
 550
 551 static int osd_declare_destroy(const struct lu_env *env, struct dt_object *dt,
 552                                struct thandle *th)
 553 {
 554         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
 555         struct osd_object       *obj = osd_dt_obj(dt);
 556         struct osd_device       *osd = osd_obj2dev(obj);
 557         struct osd_thandle      *oh;
 558         dnode_t *dn;
 559         int                      rc;
 560         uint64_t                 zapid;
 561         ENTRY;
 562
 563         LASSERT(th != NULL);
 564         LASSERT(dt_object_exists(dt));
 565
 566         oh = container_of0(th, struct osd_thandle, ot_super);
 567         LASSERT(oh->ot_tx != NULL);
 568
 569         dmu_tx_mark_netfree(oh->ot_tx);
 570
 571         /* declare that we'll remove object from fid-dnode mapping */
 572         zapid = osd_get_name_n_idx(env, osd, fid, NULL, 0, &dn);
 573         osd_tx_hold_zap(oh->ot_tx, zapid, dn, FALSE, NULL);
 574
 575         osd_declare_xattrs_destroy(env, obj, oh);
 576
 577         /* one less inode */
 578         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
 579                                obj->oo_attr.la_gid, obj->oo_attr.la_projid,
 580                                -1, oh, NULL, OSD_QID_INODE);
 581         if (rc)
 582                 RETURN(rc);
 583
 584         /* data to be truncated */
 585         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
 586                                obj->oo_attr.la_gid, obj->oo_attr.la_projid,
 587                                0, oh, NULL, OSD_QID_BLK);
 588         if (rc)
 589                 RETURN(rc);
 590
 591         osd_object_set_destroy_type(obj);
 592         if (obj->oo_destroy == OSD_DESTROY_SYNC)
 593                 dmu_tx_hold_free(oh->ot_tx, obj->oo_dn->dn_object,
 594                                  0, DMU_OBJECT_END);
 595         else
 596                 osd_tx_hold_zap(oh->ot_tx, osd->od_unlinked->dn_object,
 597                                 osd->od_unlinked, TRUE, NULL);
 598
 599         /* will help to find FID->ino when this object is being
 600          * added to PENDING/ */
 601         osd_idc_find_and_init(env, osd, obj);
 602
 603         RETURN(0);
 604 }
 605
 606 static int osd_destroy(const struct lu_env *env, struct dt_object *dt,
 607                        struct thandle *th)
 608 {
 609         struct osd_thread_info  *info = osd_oti_get(env);
 610         char                    *buf = info->oti_str;
 611         struct osd_object       *obj = osd_dt_obj(dt);
 612         struct osd_device       *osd = osd_obj2dev(obj);
 613         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
 614         struct osd_thandle      *oh;
 615         int                      rc;
 616         uint64_t                 oid, zapid;
 617         dnode_t *zdn;
 618         ENTRY;
 619
 620         down_write(&obj->oo_guard);
 621
 622         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
 623                 GOTO(out, rc = -ENOENT);
 624
 625         LASSERT(obj->oo_dn != NULL);
 626
 627         oh = container_of0(th, struct osd_thandle, ot_super);
 628         LASSERT(oh != NULL);
 629         LASSERT(oh->ot_tx != NULL);
 630
 631         /* remove obj ref from index dir (it depends) */
 632         zapid = osd_get_name_n_idx(env, osd, fid, buf,
 633                                    sizeof(info->oti_str), &zdn);
 634         rc = osd_zap_remove(osd, zapid, zdn, buf, oh->ot_tx);
 635         if (rc) {
 636                 CERROR("%s: zap_remove(%s) failed: rc = %d\n",
 637                        osd->od_svname, buf, rc);
 638                 GOTO(out, rc);
 639         }
 640
 641         rc = osd_xattrs_destroy(env, obj, oh);
 642         if (rc) {
 643                 CERROR("%s: cannot destroy xattrs for %s: rc = %d\n",
 644                        osd->od_svname, buf, rc);
 645                 GOTO(out, rc);
 646         }
 647
 648         oid = obj->oo_dn->dn_object;
 649         if (unlikely(obj->oo_destroy == OSD_DESTROY_NONE)) {
 650                 /* this may happen if the destroy wasn't declared
 651                  * e.g. when the object is created and then destroyed
 652                  * in the same transaction - we don't need additional
 653                  * space for destroy specifically */
 654                 LASSERT(obj->oo_attr.la_size <= osd_sync_destroy_max_size);
 655                 rc = -dmu_object_free(osd->od_os, oid, oh->ot_tx);
 656                 if (rc)
 657                         CERROR("%s: failed to free %s %llu: rc = %d\n",
 658                                osd->od_svname, buf, oid, rc);
 659         } else if (obj->oo_destroy == OSD_DESTROY_SYNC) {
 660                 rc = -dmu_object_free(osd->od_os, oid, oh->ot_tx);
 661                 if (rc)
 662                         CERROR("%s: failed to free %s %llu: rc = %d\n",
 663                                osd->od_svname, buf, oid, rc);
 664         } else { /* asynchronous destroy */
 665                 char *key = info->oti_key;
 666
 667                 rc = osd_object_unlinked_add(obj, oh);
 668                 if (rc)
 669                         GOTO(out, rc);
 670
 671                 snprintf(key, sizeof(info->oti_key), "%llx", oid);
 672                 rc = osd_zap_add(osd, osd->od_unlinked->dn_object,
 673                                  osd->od_unlinked, key, 8, 1, &oid, oh->ot_tx);
 674                 if (rc)
 675                         CERROR("%s: zap_add_int() failed %s %llu: rc = %d\n",
 676                                osd->od_svname, buf, oid, rc);
 677         }
 678
 679 out:
 680         /* not needed in the cache anymore */
 681         set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
 682         if (rc == 0)
 683                 obj->oo_destroyed = 1;
 684         up_write(&obj->oo_guard);
 685         RETURN (0);
 686 }
 687
 688 static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
 689 {
 690         struct osd_object *obj = osd_obj(l);
 691         const struct lu_fid *fid = lu_object_fid(l);
 692
 693         if (obj->oo_dn) {
 694                 if (likely(!fid_is_acct(fid))) {
 695                         osd_object_sa_fini(obj);
 696                         if (obj->oo_sa_xattr) {
 697                                 nvlist_free(obj->oo_sa_xattr);
 698                                 obj->oo_sa_xattr = NULL;
 699                         }
 700                         osd_dnode_rele(obj->oo_dn);
 701                         list_del(&obj->oo_sa_linkage);
 702                 }
 703                 obj->oo_dn = NULL;
 704         }
 705 }
 706
 707 /*
 708  * Concurrency: ->loo_object_release() is called under site spin-lock.
 709  */
 710 static void osd_object_release(const struct lu_env *env,
 711                                struct lu_object *l)
 712 {
 713 }
 714
 715 /*
 716  * Concurrency: shouldn't matter.
 717  */
 718 static int osd_object_print(const struct lu_env *env, void *cookie,
 719                             lu_printer_t p, const struct lu_object *l)
 720 {
 721         struct osd_object *o = osd_obj(l);
 722
 723         return (*p)(env, cookie, LUSTRE_OSD_ZFS_NAME"-object@%p", o);
 724 }
 725
 726 static void osd_read_lock(const struct lu_env *env, struct dt_object *dt,
 727                           unsigned role)
 728 {
 729         struct osd_object *obj = osd_dt_obj(dt);
 730
 731         LASSERT(osd_invariant(obj));
 732
 733         down_read_nested(&obj->oo_sem, role);
 734 }
 735
 736 static void osd_write_lock(const struct lu_env *env, struct dt_object *dt,
 737                            unsigned role)
 738 {
 739         struct osd_object *obj = osd_dt_obj(dt);
 740
 741         LASSERT(osd_invariant(obj));
 742
 743         down_write_nested(&obj->oo_sem, role);
 744 }
 745
 746 static void osd_read_unlock(const struct lu_env *env, struct dt_object *dt)
 747 {
 748         struct osd_object *obj = osd_dt_obj(dt);
 749
 750         LASSERT(osd_invariant(obj));
 751         up_read(&obj->oo_sem);
 752 }
 753
 754 static void osd_write_unlock(const struct lu_env *env, struct dt_object *dt)
 755 {
 756         struct osd_object *obj = osd_dt_obj(dt);
 757
 758         LASSERT(osd_invariant(obj));
 759         up_write(&obj->oo_sem);
 760 }
 761
 762 static int osd_write_locked(const struct lu_env *env, struct dt_object *dt)
 763 {
 764         struct osd_object *obj = osd_dt_obj(dt);
 765         int rc = 1;
 766
 767         LASSERT(osd_invariant(obj));
 768
 769         if (down_write_trylock(&obj->oo_sem)) {
 770                 rc = 0;
 771                 up_write(&obj->oo_sem);
 772         }
 773         return rc;
 774 }
 775
 776 static int osd_attr_get(const struct lu_env *env,
 777                         struct dt_object *dt,
 778                         struct lu_attr *attr)
 779 {
 780         struct osd_object       *obj = osd_dt_obj(dt);
 781         uint64_t                 blocks;
 782         uint32_t                 blksize;
 783         int                      rc = 0;
 784
 785         down_read(&obj->oo_guard);
 786
 787         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
 788                 GOTO(out, rc = -ENOENT);
 789
 790         if (unlikely(fid_is_acct(lu_object_fid(&dt->do_lu))))
 791                 GOTO(out, rc = 0);
 792
 793         LASSERT(osd_invariant(obj));
 794         LASSERT(obj->oo_dn);
 795
 796         read_lock(&obj->oo_attr_lock);
 797         *attr = obj->oo_attr;
 798         if (obj->oo_lma_flags & LUSTRE_ORPHAN_FL)
 799                 attr->la_flags |= LUSTRE_ORPHAN_FL;
 800         read_unlock(&obj->oo_attr_lock);
 801
 802         /* with ZFS_DEBUG zrl_add_debug() called by DB_DNODE_ENTER()
 803          * from within sa_object_size() can block on a mutex, so
 804          * we can't call sa_object_size() holding rwlock */
 805         sa_object_size(obj->oo_sa_hdl, &blksize, &blocks);
 806         /* we do not control size of indices, so always calculate
 807          * it from number of blocks reported by DMU */
 808         if (S_ISDIR(attr->la_mode))
 809                 attr->la_size = 512 * blocks;
 810         /* Block size may be not set; suggest maximal I/O transfers. */
 811         if (blksize == 0)
 812                 blksize = osd_spa_maxblocksize(
 813                         dmu_objset_spa(osd_obj2dev(obj)->od_os));
 814
 815         attr->la_blksize = blksize;
 816         attr->la_blocks = blocks;
 817         attr->la_valid |= LA_BLOCKS | LA_BLKSIZE;
 818
 819 out:
 820         up_read(&obj->oo_guard);
 821         return rc;
 822 }
 823
 824 /* Simple wrapper on top of qsd API which implement quota transfer for osd
 825  * setattr needs. As a reminder, only the root user can change ownership of
 826  * a file, that's why EDQUOT & EINPROGRESS errors are discarded */
 827 static inline int qsd_transfer(const struct lu_env *env,
 828                                struct qsd_instance *qsd,
 829                                struct lquota_trans *trans, int qtype,
 830                                __u64 orig_id, __u64 new_id, __u64 bspace,
 831                                struct lquota_id_info *qi, bool ignore_edquot)
 832 {
 833         int     rc;
 834
 835         if (unlikely(qsd == NULL))
 836                 return 0;
 837
 838         LASSERT(qtype >= 0 && qtype < LL_MAXQUOTAS);
 839         qi->lqi_type = qtype;
 840
 841         /* inode accounting */
 842         qi->lqi_is_blk = false;
 843
 844         /* one more inode for the new owner ... */
 845         qi->lqi_id.qid_uid = new_id;
 846         qi->lqi_space      = 1;
 847         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
 848         if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS))
 849                 rc = 0;
 850         if (rc)
 851                 return rc;
 852
 853         /* and one less inode for the current id */
 854         qi->lqi_id.qid_uid = orig_id;;
 855         qi->lqi_space      = -1;
 856         /* can't get EDQUOT when reducing usage */
 857         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
 858         if (rc == -EINPROGRESS)
 859                 rc = 0;
 860         if (rc)
 861                 return rc;
 862
 863         /* block accounting */
 864         qi->lqi_is_blk = true;
 865
 866         /* more blocks for the new owner ... */
 867         qi->lqi_id.qid_uid = new_id;
 868         qi->lqi_space      = bspace;
 869         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
 870         if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS))
 871                 rc = 0;
 872         if (rc)
 873                 return rc;
 874
 875         /* and finally less blocks for the current owner */
 876         qi->lqi_id.qid_uid = orig_id;
 877         qi->lqi_space      = -bspace;
 878         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
 879         /* can't get EDQUOT when reducing usage */
 880         if (rc == -EINPROGRESS)
 881                 rc = 0;
 882         return rc;
 883 }
 884
 885 static int osd_declare_attr_set(const struct lu_env *env,
 886                                 struct dt_object *dt,
 887                                 const struct lu_attr *attr,
 888                                 struct thandle *handle)
 889 {
 890         struct osd_thread_info  *info = osd_oti_get(env);
 891         struct osd_object       *obj = osd_dt_obj(dt);
 892         struct osd_device       *osd = osd_obj2dev(obj);
 893         dmu_tx_hold_t           *txh;
 894         struct osd_thandle      *oh;
 895         uint64_t                 bspace;
 896         uint32_t                 blksize;
 897         int                      rc = 0;
 898         bool                     found;
 899         ENTRY;
 900
 901
 902         LASSERT(handle != NULL);
 903         LASSERT(osd_invariant(obj));
 904
 905         oh = container_of0(handle, struct osd_thandle, ot_super);
 906
 907         down_read(&obj->oo_guard);
 908         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
 909                 GOTO(out, rc = 0);
 910
 911         LASSERT(obj->oo_sa_hdl != NULL);
 912         LASSERT(oh->ot_tx != NULL);
 913         /* regular attributes are part of the bonus buffer */
 914         /* let's check whether this object is already part of
 915          * transaction.. */
 916         found = false;
 917         for (txh = list_head(&oh->ot_tx->tx_holds); txh;
 918              txh = list_next(&oh->ot_tx->tx_holds, txh)) {
 919                 if (txh->txh_dnode == NULL)
 920                         continue;
 921                 if (txh->txh_dnode->dn_object != obj->oo_dn->dn_object)
 922                         continue;
 923                 /* this object is part of the transaction already
 924                  * we don't need to declare bonus again */
 925                 found = true;
 926                 break;
 927         }
 928         if (!found)
 929                 dmu_tx_hold_bonus(oh->ot_tx, obj->oo_dn->dn_object);
 930         if (oh->ot_tx->tx_err != 0)
 931                 GOTO(out, rc = -oh->ot_tx->tx_err);
 932
 933         if (attr && attr->la_valid & LA_FLAGS) {
 934                 /* LMA is usually a part of bonus, no need to declare
 935                  * anything else */
 936         }
 937
 938         if (attr && (attr->la_valid & (LA_UID | LA_GID | LA_PROJID))) {
 939                 sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
 940                 bspace = toqb(bspace * 512);
 941
 942                 CDEBUG(D_QUOTA, "%s: enforce quota on UID %u, GID %u,"
 943                        "the quota space is %lld (%u)\n", osd->od_svname,
 944                        attr->la_uid, attr->la_gid, bspace, blksize);
 945         }
 946
 947         if (attr && attr->la_valid & LA_UID) {
 948                 /* quota enforcement for user */
 949                 if (attr->la_uid != obj->oo_attr.la_uid) {
 950                         rc = qsd_transfer(env, osd->od_quota_slave,
 951                                           &oh->ot_quota_trans, USRQUOTA,
 952                                           obj->oo_attr.la_uid, attr->la_uid,
 953                                           bspace, &info->oti_qi, true);
 954                         if (rc)
 955                                 GOTO(out, rc);
 956                 }
 957         }
 958         if (attr && attr->la_valid & LA_GID) {
 959                 /* quota enforcement for group */
 960                 if (attr->la_gid != obj->oo_attr.la_gid) {
 961                         rc = qsd_transfer(env, osd->od_quota_slave,
 962                                           &oh->ot_quota_trans, GRPQUOTA,
 963                                           obj->oo_attr.la_gid, attr->la_gid,
 964                                           bspace, &info->oti_qi,
 965                                           !(attr->la_flags &
 966                                                         LUSTRE_SET_SYNC_FL));
 967                         if (rc)
 968                                 GOTO(out, rc);
 969                 }
 970         }
 971 #ifdef ZFS_PROJINHERIT
 972         if (attr && attr->la_valid & LA_PROJID) {
 973                 if (!osd->od_projectused_dn)
 974                         GOTO(out, rc = -EOPNOTSUPP);
 975
 976                 /* Usually, if project quota is upgradable for the device,
 977                  * then the upgrade will be done before or when mount the
 978                  * device. So when we come here, this project should have
 979                  * project ID attribute already (that is zero by default).
 980                  * Otherwise, there was something wrong during the former
 981                  * upgrade, let's return failure to report that.
 982                  *
 983                  * Please note that, different from other attributes, you
 984                  * can NOT simply set the project ID attribute under such
 985                  * case, because adding (NOT change) project ID attribute
 986                  * needs to change the object's attribute layout to match
 987                  * zfs backend quota accounting requirement. */
 988                 if (unlikely(!obj->oo_with_projid))
 989                         GOTO(out, rc = -ENXIO);
 990
 991                 /* quota enforcement for project */
 992                 if (attr->la_projid != obj->oo_attr.la_projid) {
 993                         rc = qsd_transfer(env, osd->od_quota_slave,
 994                                           &oh->ot_quota_trans, PRJQUOTA,
 995                                           obj->oo_attr.la_projid,
 996                                           attr->la_projid, bspace,
 997                                           &info->oti_qi, true);
 998                         if (rc)
 999                                 GOTO(out, rc);
1000                 }
1001         }
1002 #endif
1003 out:
1004         up_read(&obj->oo_guard);
1005         RETURN(rc);
1006 }
1007
1008 /*
1009  * Set the attributes of an object
1010  *
1011  * The transaction passed to this routine must have
1012  * dmu_tx_hold_bonus(tx, oid) called and then assigned
1013  * to a transaction group.
1014  */
1015 static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
1016                         const struct lu_attr *la, struct thandle *handle)
1017 {
1018         struct osd_thread_info  *info = osd_oti_get(env);
1019         sa_bulk_attr_t          *bulk = osd_oti_get(env)->oti_attr_bulk;
1020         struct osd_object       *obj = osd_dt_obj(dt);
1021         struct osd_device       *osd = osd_obj2dev(obj);
1022         struct osd_thandle      *oh;
1023         struct osa_attr         *osa = &info->oti_osa;
1024         __u64                    valid = la->la_valid;
1025         int                      cnt;
1026         int                      rc = 0;
1027
1028         ENTRY;
1029
1030         down_read(&obj->oo_guard);
1031         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1032                 GOTO(out, rc = -ENOENT);
1033
1034         LASSERT(handle != NULL);
1035         LASSERT(osd_invariant(obj));
1036         LASSERT(obj->oo_sa_hdl);
1037
1038         oh = container_of0(handle, struct osd_thandle, ot_super);
1039         /* Assert that the transaction has been assigned to a
1040            transaction group. */
1041         LASSERT(oh->ot_tx->tx_txg != 0);
1042
1043         /* Only allow set size for regular file */
1044         if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
1045                 valid &= ~(LA_SIZE | LA_BLOCKS);
1046
1047         if (valid & LA_CTIME && la->la_ctime == obj->oo_attr.la_ctime)
1048                 valid &= ~LA_CTIME;
1049
1050         if (valid & LA_MTIME && la->la_mtime == obj->oo_attr.la_mtime)
1051                 valid &= ~LA_MTIME;
1052
1053         if (valid & LA_ATIME && la->la_atime == obj->oo_attr.la_atime)
1054                 valid &= ~LA_ATIME;
1055
1056         if (valid == 0)
1057                 GOTO(out, rc = 0);
1058
1059         if (valid & LA_FLAGS) {
1060                 struct lustre_mdt_attrs *lma;
1061                 struct lu_buf buf;
1062                 int size = 0;
1063
1064                 if (la->la_flags & LUSTRE_LMA_FL_MASKS) {
1065                         CLASSERT(sizeof(info->oti_buf) >= sizeof(*lma));
1066                         lma = (struct lustre_mdt_attrs *)&info->oti_buf;
1067                         buf.lb_buf = lma;
1068                         buf.lb_len = sizeof(info->oti_buf);
1069
1070                         /* Please NOT call osd_xattr_get() directly, that
1071                          * will cause recursively down_read() on oo_gurad. */
1072                         rc = osd_xattr_get_internal(env, obj, &buf,
1073                                                     XATTR_NAME_LMA, &size);
1074                         if (!rc && unlikely(size < sizeof(*lma)))
1075                                 rc = -EINVAL;
1076                         if (!rc) {
1077                                 lma->lma_incompat =
1078                                         le32_to_cpu(lma->lma_incompat);
1079                                 lma->lma_incompat |=
1080                                         lustre_to_lma_flags(la->la_flags);
1081                                 lma->lma_incompat =
1082                                         cpu_to_le32(lma->lma_incompat);
1083                                 buf.lb_buf = lma;
1084                                 buf.lb_len = sizeof(*lma);
1085                                 rc = osd_xattr_set_internal(env, obj, &buf,
1086                                                             XATTR_NAME_LMA,
1087                                                             LU_XATTR_REPLACE,
1088                                                             oh);
1089                         }
1090                         if (rc < 0) {
1091                                 CWARN("%s: failed to set LMA flags: rc = %d\n",
1092                                        osd->od_svname, rc);
1093                                 GOTO(out, rc);
1094                         }
1095                 }
1096         }
1097
1098         write_lock(&obj->oo_attr_lock);
1099         cnt = 0;
1100
1101         if (valid & LA_PROJID) {
1102 #ifdef ZFS_PROJINHERIT
1103                 /* osd_declare_attr_set() must be called firstly.
1104                  * If osd::od_projectused_dn is not set, then we
1105                  * can not arrive at here. */
1106                 LASSERT(osd->od_projectused_dn);
1107                 LASSERT(obj->oo_with_projid);
1108
1109                 osa->projid = obj->oo_attr.la_projid = la->la_projid;
1110                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
1111                                  &osa->projid, 8);
1112 #else
1113                 valid &= ~LA_PROJID;
1114 #endif
1115         }
1116
1117         if (valid & LA_ATIME) {
1118                 osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
1119                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL,
1120                                  osa->atime, 16);
1121         }
1122         if (valid & LA_MTIME) {
1123                 osa->mtime[0] = obj->oo_attr.la_mtime = la->la_mtime;
1124                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL,
1125                                  osa->mtime, 16);
1126         }
1127         if (valid & LA_CTIME) {
1128                 osa->ctime[0] = obj->oo_attr.la_ctime = la->la_ctime;
1129                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL,
1130                                  osa->ctime, 16);
1131         }
1132         if (valid & LA_MODE) {
1133                 /* mode is stored along with type, so read it first */
1134                 obj->oo_attr.la_mode = (obj->oo_attr.la_mode & S_IFMT) |
1135                         (la->la_mode & ~S_IFMT);
1136                 osa->mode = obj->oo_attr.la_mode;
1137                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL,
1138                                  &osa->mode, 8);
1139         }
1140         if (valid & LA_SIZE) {
1141                 osa->size = obj->oo_attr.la_size = la->la_size;
1142                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL,
1143                                  &osa->size, 8);
1144         }
1145         if (valid & LA_NLINK) {
1146                 osa->nlink = obj->oo_attr.la_nlink = la->la_nlink;
1147                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL,
1148                                  &osa->nlink, 8);
1149         }
1150         if (valid & LA_RDEV) {
1151                 osa->rdev = obj->oo_attr.la_rdev = la->la_rdev;
1152                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL,
1153                                  &osa->rdev, 8);
1154         }
1155         if (valid & LA_FLAGS) {
1156                 osa->flags = attrs_fs2zfs(la->la_flags);
1157                 /* many flags are not supported by zfs, so ensure a good cached
1158                  * copy */
1159                 obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags);
1160 #ifdef ZFS_PROJINHERIT
1161                 if (obj->oo_with_projid)
1162                         osa->flags |= ZFS_PROJID;
1163 #endif
1164                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL,
1165                                  &osa->flags, 8);
1166         }
1167         if (valid & LA_UID) {
1168                 osa->uid = obj->oo_attr.la_uid = la->la_uid;
1169                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL,
1170                                  &osa->uid, 8);
1171         }
1172         if (valid & LA_GID) {
1173                 osa->gid = obj->oo_attr.la_gid = la->la_gid;
1174                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL,
1175                                  &osa->gid, 8);
1176         }
1177         obj->oo_attr.la_valid |= valid;
1178         write_unlock(&obj->oo_attr_lock);
1179
1180         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
1181         rc = osd_object_sa_bulk_update(obj, bulk, cnt, oh);
1182
1183 out:
1184         up_read(&obj->oo_guard);
1185         RETURN(rc);
1186 }
1187
1188 /*
1189  * Object creation.
1190  *
1191  * XXX temporary solution.
1192  */
1193
1194 static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
1195                         struct dt_object *parent, struct dt_object *child,
1196                         umode_t child_mode)
1197 {
1198         LASSERT(ah);
1199
1200         ah->dah_parent = parent;
1201         ah->dah_mode = child_mode;
1202
1203         if (parent != NULL && !dt_object_remote(parent)) {
1204                 /* will help to find FID->ino at dt_insert("..") */
1205                 struct osd_object *pobj = osd_dt_obj(parent);
1206
1207                 osd_idc_find_and_init(env, osd_obj2dev(pobj), pobj);
1208         }
1209 }
1210
1211 static int osd_declare_create(const struct lu_env *env, struct dt_object *dt,
1212                               struct lu_attr *attr,
1213                               struct dt_allocation_hint *hint,
1214                               struct dt_object_format *dof,
1215                               struct thandle *handle)
1216 {
1217         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
1218         struct osd_object       *obj = osd_dt_obj(dt);
1219         struct osd_device       *osd = osd_obj2dev(obj);
1220         struct osd_thandle      *oh;
1221         uint64_t                 zapid;
1222         dnode_t                 *dn;
1223         int                      rc, dnode_size;
1224         ENTRY;
1225
1226         LASSERT(dof);
1227
1228         switch (dof->dof_type) {
1229                 case DFT_REGULAR:
1230                 case DFT_SYM:
1231                 case DFT_NODE:
1232                         if (obj->oo_dt.do_body_ops == NULL)
1233                                 obj->oo_dt.do_body_ops = &osd_body_ops;
1234                         break;
1235                 default:
1236                         break;
1237         }
1238
1239         LASSERT(handle != NULL);
1240         oh = container_of0(handle, struct osd_thandle, ot_super);
1241         LASSERT(oh->ot_tx != NULL);
1242
1243         /* this is the minimum set of EAs on every Lustre object */
1244         obj->oo_ea_in_bonus = ZFS_SA_BASE_ATTR_SIZE +
1245                                 sizeof(__u64) + /* VBR VERSION */
1246                                 sizeof(struct lustre_mdt_attrs); /* LMA */
1247         /* reserve 32 bytes for extra stuff like ACLs */
1248         dnode_size = size_roundup_power2(obj->oo_ea_in_bonus + 32);
1249
1250         switch (dof->dof_type) {
1251                 case DFT_DIR:
1252                         dt->do_index_ops = &osd_dir_ops;
1253                 case DFT_INDEX:
1254                         /* for zap create */
1255                         dmu_tx_hold_zap(oh->ot_tx, DMU_NEW_OBJECT, FALSE, NULL);
1256                         dmu_tx_hold_sa_create(oh->ot_tx, dnode_size);
1257                         break;
1258                 case DFT_REGULAR:
1259                 case DFT_SYM:
1260                 case DFT_NODE:
1261                         /* first, we'll create new object */
1262                         dmu_tx_hold_sa_create(oh->ot_tx, dnode_size);
1263                         break;
1264
1265                 default:
1266                         LBUG();
1267                         break;
1268         }
1269
1270         /* and we'll add it to some mapping */
1271         zapid = osd_get_name_n_idx(env, osd, fid, NULL, 0, &dn);
1272         osd_tx_hold_zap(oh->ot_tx, zapid, dn, TRUE, NULL);
1273
1274         /* will help to find FID->ino mapping at dt_insert() */
1275         osd_idc_find_and_init(env, osd, obj);
1276
1277         rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid,
1278                                attr->la_projid, 1, oh, NULL, OSD_QID_INODE);
1279
1280         RETURN(rc);
1281 }
1282
1283 int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
1284                     struct osd_object *obj, sa_handle_t *sa_hdl, dmu_tx_t *tx,
1285                     struct lu_attr *la, uint64_t parent,
1286                     nvlist_t *xattr)
1287 {
1288         sa_bulk_attr_t  *bulk = osd_oti_get(env)->oti_attr_bulk;
1289         struct osa_attr *osa = &osd_oti_get(env)->oti_osa;
1290         uint64_t         gen;
1291         uint64_t         crtime[2];
1292         timestruc_t      now;
1293         int              cnt;
1294         int              rc;
1295         char *dxattr = NULL;
1296         size_t sa_size;
1297
1298
1299         LASSERT(sa_hdl);
1300
1301         gen = dmu_tx_get_txg(tx);
1302         gethrestime(&now);
1303         ZFS_TIME_ENCODE(&now, crtime);
1304
1305         osa->atime[0] = la->la_atime;
1306         osa->ctime[0] = la->la_ctime;
1307         osa->mtime[0] = la->la_mtime;
1308         osa->mode = la->la_mode;
1309         osa->uid = la->la_uid;
1310         osa->gid = la->la_gid;
1311         osa->rdev = la->la_rdev;
1312         osa->nlink = la->la_nlink;
1313         if (la->la_valid & LA_FLAGS)
1314                 osa->flags = attrs_fs2zfs(la->la_flags);
1315         else
1316                 osa->flags = 0;
1317         osa->size  = la->la_size;
1318 #ifdef ZFS_PROJINHERIT
1319         if (osd->od_projectused_dn) {
1320                 if (la->la_valid & LA_PROJID)
1321                         osa->projid = la->la_projid;
1322                 else
1323                         osa->projid = ZFS_DEFAULT_PROJID;
1324                 osa->flags |= ZFS_PROJID;
1325                 if (obj)
1326                         obj->oo_with_projid = 1;
1327         } else {
1328                 osa->flags &= ~ZFS_PROJID;
1329         }
1330 #endif
1331
1332         /*
1333          * we need to create all SA below upon object create.
1334          *
1335          * XXX The attribute order matters since the accounting callback relies
1336          * on static offsets (i.e. SA_*_OFFSET, see zfs_space_delta_cb()) to
1337          * look up the UID/GID/PROJID attributes. Moreover, the callback does
1338          * not seem to support the spill block.
1339          * We define attributes in the same order as SA_*_OFFSET in order to
1340          * work around the problem. See ORI-610.
1341          */
1342         cnt = 0;
1343         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL, &osa->mode, 8);
1344         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL, &osa->size, 8);
1345         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GEN(osd), NULL, &gen, 8);
1346         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL, &osa->uid, 8);
1347         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL, &osa->gid, 8);
1348         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PARENT(osd), NULL, &parent, 8);
1349         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL, &osa->flags, 8);
1350         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL, osa->atime, 16);
1351         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL, osa->mtime, 16);
1352         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
1353         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, crtime, 16);
1354         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
1355 #ifdef ZFS_PROJINHERIT
1356         if (osd->od_projectused_dn)
1357                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PROJID(osd), NULL,
1358                                  &osa->projid, 8);
1359 #endif
1360         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
1361         LASSERT(cnt <= ARRAY_SIZE(osd_oti_get(env)->oti_attr_bulk));
1362
1363         if (xattr) {
1364                 rc = -nvlist_size(xattr, &sa_size, NV_ENCODE_XDR);
1365                 LASSERT(rc == 0);
1366
1367                 dxattr = osd_zio_buf_alloc(sa_size);
1368                 LASSERT(dxattr);
1369
1370                 rc = -nvlist_pack(xattr, &dxattr, &sa_size,
1371                                 NV_ENCODE_XDR, KM_SLEEP);
1372                 LASSERT(rc == 0);
1373
1374                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_DXATTR(osd),
1375                                 NULL, dxattr, sa_size);
1376         }
1377
1378         rc = -sa_replace_all_by_template(sa_hdl, bulk, cnt, tx);
1379         if (dxattr)
1380                 osd_zio_buf_free(dxattr, sa_size);
1381
1382         return rc;
1383 }
1384
1385 static int osd_find_new_dnode(const struct lu_env *env, dmu_tx_t *tx,
1386                               uint64_t oid, dnode_t **dnp)
1387 {
1388         dmu_tx_hold_t *txh;
1389         int rc = 0;
1390
1391         /* take dnode_t from tx to save on dnode#->dnode_t lookup */
1392         for (txh = list_tail(&tx->tx_holds); txh;
1393              txh = list_prev(&tx->tx_holds, txh)) {
1394                 dnode_t *dn = txh->txh_dnode;
1395                 dmu_buf_impl_t *db;
1396
1397                 if (dn == NULL)
1398                         continue;
1399                 if (dn->dn_object != oid)
1400                         continue;
1401                 db = dn->dn_bonus;
1402                 if (db == NULL) {
1403                         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1404                         if (dn->dn_bonus == NULL)
1405                                 dbuf_create_bonus(dn);
1406                         rw_exit(&dn->dn_struct_rwlock);
1407                 }
1408                 db = dn->dn_bonus;
1409                 LASSERT(db);
1410                 LASSERT(dn->dn_handle);
1411                 DB_DNODE_ENTER(db);
1412                 if (refcount_add(&db->db_holds, osd_obj_tag) == 1) {
1413                         refcount_add(&dn->dn_holds, tag);
1414                         atomic_inc_32(&dn->dn_dbufs_count);
1415                 }
1416                 *dnp = dn;
1417                 dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH);
1418                 break;
1419         }
1420
1421         if (unlikely(*dnp == NULL))
1422                 rc = __osd_obj2dnode(tx->tx_objset, oid, dnp);
1423
1424         return rc;
1425 }
1426
1427 #ifdef HAVE_DMU_OBJECT_ALLOC_DNSIZE
1428 static int osd_find_dnsize(struct osd_object *obj)
1429 {
1430         struct osd_device *osd = osd_obj2dev(obj);
1431         int dnsize;
1432
1433         if (osd->od_dnsize == ZFS_DNSIZE_AUTO) {
1434                 dnsize = DNODE_MIN_SIZE;
1435                 do {
1436                         if (DN_BONUS_SIZE(dnsize) >= obj->oo_ea_in_bonus + 32)
1437                                 break;
1438                         dnsize <<= 1;
1439                 } while (dnsize < DNODE_MAX_SIZE);
1440                 if (dnsize > DNODE_MAX_SIZE)
1441                         dnsize = DNODE_MAX_SIZE;
1442         } else if (osd->od_dnsize == ZFS_DNSIZE_1K) {
1443                 dnsize = 1024;
1444         } else if (osd->od_dnsize == ZFS_DNSIZE_2K) {
1445                 dnsize = 2048;
1446         } else if (osd->od_dnsize == ZFS_DNSIZE_4K) {
1447                 dnsize = 4096;
1448         } else if (osd->od_dnsize == ZFS_DNSIZE_8K) {
1449                 dnsize = 8192;
1450         } else if (osd->od_dnsize == ZFS_DNSIZE_16K) {
1451                 dnsize = 16384;
1452         } else {
1453                 dnsize = DNODE_MIN_SIZE;
1454         }
1455         return dnsize;
1456 }
1457 #else
1458 static int inline osd_find_dnsize(struct osd_object *obj)
1459 {
1460         return DN_MAX_BONUSLEN;
1461 }
1462 #endif
1463
1464 /*
1465  * The transaction passed to this routine must have
1466  * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned
1467  * to a transaction group.
1468  */
1469 int __osd_object_create(const struct lu_env *env, struct osd_object *obj,
1470                         dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la)
1471 {
1472         struct osd_device   *osd = osd_obj2dev(obj);
1473         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
1474         dmu_object_type_t    type = DMU_OT_PLAIN_FILE_CONTENTS;
1475         uint64_t oid;
1476
1477         /* Use DMU_OTN_UINT8_METADATA for local objects so their data blocks
1478          * would get an additional ditto copy */
1479         if (unlikely(S_ISREG(la->la_mode) &&
1480                      fid_seq_is_local_file(fid_seq(fid))))
1481                 type = DMU_OTN_UINT8_METADATA;
1482
1483         /* Create a new DMU object using the default dnode size. */
1484         oid = osd_dmu_object_alloc(osd->od_os, type, 0,
1485                                    osd_find_dnsize(obj), tx);
1486
1487         LASSERT(la->la_valid & LA_MODE);
1488         la->la_size = 0;
1489         la->la_nlink = 1;
1490
1491         return osd_find_new_dnode(env, tx, oid, dnp);
1492 }
1493
1494 /*
1495  * The transaction passed to this routine must have
1496  * dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, ...) called and then assigned
1497  * to a transaction group.
1498  *
1499  * Using ZAP_FLAG_HASH64 will force the ZAP to always be a FAT ZAP.
1500  * This is fine for directories today, because storing the FID in the dirent
1501  * will also require a FAT ZAP.  If there is a new type of micro ZAP created
1502  * then we might need to re-evaluate the use of this flag and instead do
1503  * a conversion from the different internal ZAP hash formats being used. */
1504 int __osd_zap_create(const struct lu_env *env, struct osd_device *osd,
1505                      dnode_t **dnp, dmu_tx_t *tx, struct lu_attr *la,
1506                      unsigned dnsize, zap_flags_t flags)
1507 {
1508         uint64_t oid;
1509
1510         /* Assert that the transaction has been assigned to a
1511            transaction group. */
1512         LASSERT(tx->tx_txg != 0);
1513         *dnp = NULL;
1514
1515         oid = osd_zap_create_flags(osd->od_os, 0, flags | ZAP_FLAG_HASH64,
1516                                    DMU_OT_DIRECTORY_CONTENTS,
1517                                    14, /* == ZFS fzap_default_blockshift */
1518                                    DN_MAX_INDBLKSHIFT, /* indirect blockshift */
1519                                    dnsize, tx);
1520
1521         la->la_size = 2;
1522         la->la_nlink = 1;
1523
1524         return osd_find_new_dnode(env, tx, oid, dnp);
1525 }
1526
1527 static dnode_t *osd_mkidx(const struct lu_env *env, struct osd_object *obj,
1528                           struct lu_attr *la, struct osd_thandle *oh)
1529 {
1530         dnode_t *dn;
1531         int rc;
1532
1533         /* Index file should be created as regular file in order not to confuse
1534          * ZPL which could interpret them as directory.
1535          * We set ZAP_FLAG_UINT64_KEY to let ZFS know than we are going to use
1536          * binary keys */
1537         LASSERT(S_ISREG(la->la_mode));
1538         rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
1539                               osd_find_dnsize(obj), ZAP_FLAG_UINT64_KEY);
1540         if (rc)
1541                 return ERR_PTR(rc);
1542         return dn;
1543 }
1544
1545 static dnode_t *osd_mkdir(const struct lu_env *env, struct osd_object *obj,
1546                           struct lu_attr *la, struct osd_thandle *oh)
1547 {
1548         dnode_t *dn;
1549         int rc;
1550
1551         LASSERT(S_ISDIR(la->la_mode));
1552         rc = __osd_zap_create(env, osd_obj2dev(obj), &dn, oh->ot_tx, la,
1553                               osd_find_dnsize(obj), 0);
1554         if (rc)
1555                 return ERR_PTR(rc);
1556         return dn;
1557 }
1558
1559 static dnode_t *osd_mkreg(const struct lu_env *env, struct osd_object *obj,
1560                           struct lu_attr *la, struct osd_thandle *oh)
1561 {
1562         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
1563         struct osd_device *osd = osd_obj2dev(obj);
1564         dnode_t *dn;
1565         int rc;
1566
1567         LASSERT(S_ISREG(la->la_mode));
1568         rc = __osd_object_create(env, obj, &dn, oh->ot_tx, la);
1569         if (rc)
1570                 return ERR_PTR(rc);
1571
1572         if ((fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid)) &&
1573             osd->od_is_ost) {
1574                 /* The minimum block size must be at least page size otherwise
1575                  * it will break the assumption in tgt_thread_big_cache where
1576                  * the array size is PTLRPC_MAX_BRW_PAGES. It will also affect
1577                  * RDMA due to subpage transfer size */
1578                 rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object,
1579                                                PAGE_SIZE, 0, oh->ot_tx);
1580                 if (unlikely(rc)) {
1581                         CERROR("%s: can't change blocksize: %d\n",
1582                                osd->od_svname, rc);
1583                         return ERR_PTR(rc);
1584                 }
1585         }
1586
1587         return dn;
1588 }
1589
1590 static dnode_t *osd_mksym(const struct lu_env *env, struct osd_object *obj,
1591                           struct lu_attr *la, struct osd_thandle *oh)
1592 {
1593         dnode_t *dn;
1594         int rc;
1595
1596         LASSERT(S_ISLNK(la->la_mode));
1597         rc = __osd_object_create(env, obj, &dn, oh->ot_tx, la);
1598         if (rc)
1599                 return ERR_PTR(rc);
1600         return dn;
1601 }
1602
1603 static dnode_t *osd_mknod(const struct lu_env *env, struct osd_object *obj,
1604                           struct lu_attr *la, struct osd_thandle *oh)
1605 {
1606         dnode_t *dn;
1607         int rc;
1608
1609         if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode))
1610                 la->la_valid |= LA_RDEV;
1611
1612         rc = __osd_object_create(env, obj, &dn, oh->ot_tx, la);
1613         if (rc)
1614                 return ERR_PTR(rc);
1615         return dn;
1616 }
1617
1618 typedef dnode_t *(*osd_obj_type_f)(const struct lu_env *env,
1619                                    struct osd_object *obj,
1620                                    struct lu_attr *la,
1621                                    struct osd_thandle *oh);
1622
1623 static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
1624 {
1625         osd_obj_type_f result;
1626
1627         switch (type) {
1628         case DFT_DIR:
1629                 result = osd_mkdir;
1630                 break;
1631         case DFT_INDEX:
1632                 result = osd_mkidx;
1633                 break;
1634         case DFT_REGULAR:
1635                 result = osd_mkreg;
1636                 break;
1637         case DFT_SYM:
1638                 result = osd_mksym;
1639                 break;
1640         case DFT_NODE:
1641                 result = osd_mknod;
1642                 break;
1643         default:
1644                 LBUG();
1645                 break;
1646         }
1647         return result;
1648 }
1649
1650 /*
1651  * Concurrency: @dt is write locked.
1652  */
1653 static int osd_create(const struct lu_env *env, struct dt_object *dt,
1654                       struct lu_attr *attr, struct dt_allocation_hint *hint,
1655                       struct dt_object_format *dof, struct thandle *th)
1656 {
1657         struct osd_thread_info  *info = osd_oti_get(env);
1658         struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
1659         struct zpl_direntry     *zde = &info->oti_zde.lzd_reg;
1660         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
1661         struct osd_object       *obj = osd_dt_obj(dt);
1662         struct osd_device       *osd = osd_obj2dev(obj);
1663         char                    *buf = info->oti_str;
1664         struct osd_thandle      *oh;
1665         dnode_t *dn = NULL, *zdn = NULL;
1666         uint64_t                 zapid, parent = 0;
1667         int                      rc;
1668
1669         ENTRY;
1670
1671         LASSERT(!fid_is_acct(fid));
1672
1673         /* concurrent create declarations should not see
1674          * the object inconsistent (db, attr, etc).
1675          * in regular cases acquisition should be cheap */
1676         down_write(&obj->oo_guard);
1677
1678         if (unlikely(dt_object_exists(dt)))
1679                 GOTO(out, rc = -EEXIST);
1680
1681         LASSERT(osd_invariant(obj));
1682         LASSERT(dof != NULL);
1683
1684         LASSERT(th != NULL);
1685         oh = container_of0(th, struct osd_thandle, ot_super);
1686
1687         LASSERT(obj->oo_dn == NULL);
1688
1689         /* to follow ZFS on-disk format we need
1690          * to initialize parent dnode properly */
1691         if (hint != NULL && hint->dah_parent != NULL &&
1692             !dt_object_remote(hint->dah_parent))
1693                 parent = osd_dt_obj(hint->dah_parent)->oo_dn->dn_object;
1694
1695         /* we may fix some attributes, better do not change the source */
1696         obj->oo_attr = *attr;
1697         obj->oo_attr.la_valid |= LA_SIZE | LA_NLINK | LA_TYPE;
1698
1699 #ifdef ZFS_PROJINHERIT
1700         if (osd->od_projectused_dn) {
1701                 if (!(obj->oo_attr.la_valid & LA_PROJID))
1702                         obj->oo_attr.la_projid = ZFS_DEFAULT_PROJID;
1703                 obj->oo_with_projid = 1;
1704         }
1705 #endif
1706
1707         dn = osd_create_type_f(dof->dof_type)(env, obj, &obj->oo_attr, oh);
1708         if (IS_ERR(dn)) {
1709                 rc = PTR_ERR(dn);
1710                 dn = NULL;
1711                 GOTO(out, rc);
1712         }
1713
1714         zde->zde_pad = 0;
1715         zde->zde_dnode = dn->dn_object;
1716         zde->zde_type = IFTODT(attr->la_mode & S_IFMT);
1717
1718         zapid = osd_get_name_n_idx(env, osd, fid, buf,
1719                                    sizeof(info->oti_str), &zdn);
1720         rc = osd_zap_add(osd, zapid, zdn, buf, 8, 1, zde, oh->ot_tx);
1721         if (rc)
1722                 GOTO(out, rc);
1723         obj->oo_dn = dn;
1724         /* Now add in all of the "SA" attributes */
1725         rc = osd_sa_handle_get(obj);
1726         if (rc)
1727                 GOTO(out, rc);
1728
1729         rc = -nvlist_alloc(&obj->oo_sa_xattr, NV_UNIQUE_NAME, KM_SLEEP);
1730         if (rc)
1731                 GOTO(out, rc);
1732
1733         /* initialize LMA */
1734         lustre_lma_init(lma, fid, 0, 0);
1735         lustre_lma_swab(lma);
1736         rc = -nvlist_add_byte_array(obj->oo_sa_xattr, XATTR_NAME_LMA,
1737                                     (uchar_t *)lma, sizeof(*lma));
1738         if (rc)
1739                 GOTO(out, rc);
1740
1741         /* configure new osd object */
1742         obj->oo_parent = parent != 0 ? parent : zapid;
1743         obj->oo_late_attr_set = 1;
1744         rc = __osd_sa_xattr_schedule_update(env, obj, oh);
1745         if (rc)
1746                 GOTO(out, rc);
1747
1748         /* XXX: oo_lma_flags */
1749         obj->oo_dt.do_lu.lo_header->loh_attr |= obj->oo_attr.la_mode & S_IFMT;
1750         if (likely(!fid_is_acct(lu_object_fid(&obj->oo_dt.do_lu))))
1751                 /* no body operations for accounting objects */
1752                 obj->oo_dt.do_body_ops = &osd_body_ops;
1753
1754         osd_idc_find_and_init(env, osd, obj);
1755
1756 out:
1757         if (unlikely(rc && dn)) {
1758                 dmu_object_free(osd->od_os, dn->dn_object, oh->ot_tx);
1759                 osd_dnode_rele(dn);
1760                 obj->oo_dn = NULL;
1761         } else if (!rc) {
1762                 obj->oo_dt.do_lu.lo_header->loh_attr |= LOHA_EXISTS;
1763         }
1764         up_write(&obj->oo_guard);
1765         RETURN(rc);
1766 }
1767
1768 static int osd_declare_ref_add(const struct lu_env *env, struct dt_object *dt,
1769                                struct thandle *th)
1770 {
1771         osd_idc_find_and_init(env, osd_dev(dt->do_lu.lo_dev), osd_dt_obj(dt));
1772         return osd_declare_attr_set(env, dt, NULL, th);
1773 }
1774
1775 /*
1776  * Concurrency: @dt is write locked.
1777  */
1778 static int osd_ref_add(const struct lu_env *env, struct dt_object *dt,
1779                        struct thandle *handle)
1780 {
1781         struct osd_object       *obj = osd_dt_obj(dt);
1782         struct osd_thandle      *oh;
1783         struct osd_device       *osd = osd_obj2dev(obj);
1784         uint64_t                 nlink;
1785         int rc;
1786
1787         ENTRY;
1788
1789         down_read(&obj->oo_guard);
1790         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1791                 GOTO(out, rc = -ENOENT);
1792
1793         LASSERT(osd_invariant(obj));
1794         LASSERT(obj->oo_sa_hdl != NULL);
1795
1796         oh = container_of0(handle, struct osd_thandle, ot_super);
1797
1798         write_lock(&obj->oo_attr_lock);
1799         nlink = ++obj->oo_attr.la_nlink;
1800         write_unlock(&obj->oo_attr_lock);
1801
1802         rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh);
1803
1804 out:
1805         up_read(&obj->oo_guard);
1806         RETURN(rc);
1807 }
1808
1809 static int osd_declare_ref_del(const struct lu_env *env, struct dt_object *dt,
1810                                struct thandle *handle)
1811 {
1812         osd_idc_find_and_init(env, osd_dev(dt->do_lu.lo_dev), osd_dt_obj(dt));
1813         return osd_declare_attr_set(env, dt, NULL, handle);
1814 }
1815
1816 /*
1817  * Concurrency: @dt is write locked.
1818  */
1819 static int osd_ref_del(const struct lu_env *env, struct dt_object *dt,
1820                        struct thandle *handle)
1821 {
1822         struct osd_object       *obj = osd_dt_obj(dt);
1823         struct osd_thandle      *oh;
1824         struct osd_device       *osd = osd_obj2dev(obj);
1825         uint64_t                 nlink;
1826         int                      rc;
1827
1828         ENTRY;
1829
1830         down_read(&obj->oo_guard);
1831
1832         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1833                 GOTO(out, rc = -ENOENT);
1834
1835         LASSERT(osd_invariant(obj));
1836         LASSERT(obj->oo_sa_hdl != NULL);
1837
1838         oh = container_of0(handle, struct osd_thandle, ot_super);
1839         LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
1840
1841         write_lock(&obj->oo_attr_lock);
1842         nlink = --obj->oo_attr.la_nlink;
1843         write_unlock(&obj->oo_attr_lock);
1844
1845         rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh);
1846
1847 out:
1848         up_read(&obj->oo_guard);
1849         RETURN(rc);
1850 }
1851
1852 static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
1853                            __u64 start, __u64 end)
1854 {
1855         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
1856         ENTRY;
1857
1858         /* XXX: no other option than syncing the whole filesystem until we
1859          * support ZIL.  If the object tracked the txg that it was last
1860          * modified in, it could pass that txg here instead of "0".  Maybe
1861          * the changes are already committed, so no wait is needed at all? */
1862         if (!osd->od_dt_dev.dd_rdonly) {
1863                 if (osd_object_sync_delay_us < 0)
1864                         txg_wait_synced(dmu_objset_pool(osd->od_os), 0ULL);
1865                 else
1866                         udelay(osd_object_sync_delay_us);
1867         }
1868
1869         RETURN(0);
1870 }
1871
1872 static int osd_invalidate(const struct lu_env *env, struct dt_object *dt)
1873 {
1874         return 0;
1875 }
1876
1877 static struct dt_object_operations osd_obj_ops = {
1878         .do_read_lock           = osd_read_lock,
1879         .do_write_lock          = osd_write_lock,
1880         .do_read_unlock         = osd_read_unlock,
1881         .do_write_unlock        = osd_write_unlock,
1882         .do_write_locked        = osd_write_locked,
1883         .do_attr_get            = osd_attr_get,
1884         .do_declare_attr_set    = osd_declare_attr_set,
1885         .do_attr_set            = osd_attr_set,
1886         .do_ah_init             = osd_ah_init,
1887         .do_declare_create      = osd_declare_create,
1888         .do_create              = osd_create,
1889         .do_declare_destroy     = osd_declare_destroy,
1890         .do_destroy             = osd_destroy,
1891         .do_index_try           = osd_index_try,
1892         .do_declare_ref_add     = osd_declare_ref_add,
1893         .do_ref_add             = osd_ref_add,
1894         .do_declare_ref_del     = osd_declare_ref_del,
1895         .do_ref_del             = osd_ref_del,
1896         .do_xattr_get           = osd_xattr_get,
1897         .do_declare_xattr_set   = osd_declare_xattr_set,
1898         .do_xattr_set           = osd_xattr_set,
1899         .do_declare_xattr_del   = osd_declare_xattr_del,
1900         .do_xattr_del           = osd_xattr_del,
1901         .do_xattr_list          = osd_xattr_list,
1902         .do_object_sync         = osd_object_sync,
1903         .do_invalidate          = osd_invalidate,
1904 };
1905
1906 static struct lu_object_operations osd_lu_obj_ops = {
1907         .loo_object_init        = osd_object_init,
1908         .loo_object_delete      = osd_object_delete,
1909         .loo_object_release     = osd_object_release,
1910         .loo_object_free        = osd_object_free,
1911         .loo_object_print       = osd_object_print,
1912         .loo_object_invariant   = osd_object_invariant,
1913 };
1914
1915 static int osd_otable_it_attr_get(const struct lu_env *env,
1916                                 struct dt_object *dt,
1917                                 struct lu_attr *attr)
1918 {
1919         attr->la_valid = 0;
1920         return 0;
1921 }
1922
1923 static struct dt_object_operations osd_obj_otable_it_ops = {
1924         .do_attr_get            = osd_otable_it_attr_get,
1925         .do_index_try           = osd_index_try,
1926 };
1927
1928 module_param(osd_object_sync_delay_us, int, 0644);
1929 MODULE_PARM_DESC(osd_object_sync_delay_us,
1930                  "If zero or larger delay N usec instead of doing object sync");