Whamcloud - gitweb
LU-8068 osd-zfs: large dnode support
[fs/lustre-release.git] / lustre / osd-zfs / osd_object.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2012, 2015, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/osd-zfs/osd_object.c
37  *
38  * Author: Alex Zhuravlev <bzzz@whamcloud.com>
39  * Author: Mike Pershin <tappro@whamcloud.com>
40  * Author: Johann Lombardi <johann@whamcloud.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_OSD
44
45 #include <lustre_ver.h>
46 #include <libcfs/libcfs.h>
47 #include <obd_support.h>
48 #include <lustre_net.h>
49 #include <obd.h>
50 #include <obd_class.h>
51 #include <lustre_disk.h>
52 #include <lustre_fid.h>
53
54 #include "osd_internal.h"
55
56 #include <sys/dnode.h>
57 #include <sys/dbuf.h>
58 #include <sys/spa.h>
59 #include <sys/stat.h>
60 #include <sys/zap.h>
61 #include <sys/spa_impl.h>
62 #include <sys/zfs_znode.h>
63 #include <sys/dmu_tx.h>
64 #include <sys/dmu_objset.h>
65 #include <sys/dsl_prop.h>
66 #include <sys/sa_impl.h>
67 #include <sys/txg.h>
68
69 char *osd_obj_tag = "osd_object";
70
71 static struct dt_object_operations osd_obj_ops;
72 static struct lu_object_operations osd_lu_obj_ops;
73 extern struct dt_body_operations osd_body_ops;
74 static struct dt_object_operations osd_obj_otable_it_ops;
75
76 extern struct kmem_cache *osd_object_kmem;
77
78 static void
79 osd_object_sa_fini(struct osd_object *obj)
80 {
81         if (obj->oo_sa_hdl) {
82                 sa_handle_destroy(obj->oo_sa_hdl);
83                 obj->oo_sa_hdl = NULL;
84         }
85 }
86
87 static int
88 osd_object_sa_init(struct osd_object *obj, struct osd_device *o)
89 {
90         int rc;
91
92         LASSERT(obj->oo_sa_hdl == NULL);
93         LASSERT(obj->oo_db != NULL);
94
95         rc = -sa_handle_get(o->od_os, obj->oo_db->db_object, obj,
96                             SA_HDL_PRIVATE, &obj->oo_sa_hdl);
97         if (rc)
98                 return rc;
99
100         /* Cache the xattr object id, valid for the life of the object */
101         rc = -sa_lookup(obj->oo_sa_hdl, SA_ZPL_XATTR(o), &obj->oo_xattr, 8);
102         if (rc == -ENOENT) {
103                 obj->oo_xattr = ZFS_NO_OBJECT;
104                 rc = 0;
105         } else if (rc) {
106                 osd_object_sa_fini(obj);
107         }
108
109         return rc;
110 }
111
112 /*
113  * Add object to list of dirty objects in tx handle.
114  */
115 static void
116 osd_object_sa_dirty_add(struct osd_object *obj, struct osd_thandle *oh)
117 {
118         if (!list_empty(&obj->oo_sa_linkage))
119                 return;
120
121         down(&oh->ot_sa_lock);
122         write_lock(&obj->oo_attr_lock);
123         if (likely(list_empty(&obj->oo_sa_linkage)))
124                 list_add(&obj->oo_sa_linkage, &oh->ot_sa_list);
125         write_unlock(&obj->oo_attr_lock);
126         up(&oh->ot_sa_lock);
127 }
128
129 /*
130  * Release spill block dbuf hold for all dirty SAs.
131  */
132 void osd_object_sa_dirty_rele(struct osd_thandle *oh)
133 {
134         struct osd_object *obj;
135
136         down(&oh->ot_sa_lock);
137         while (!list_empty(&oh->ot_sa_list)) {
138                 obj = list_entry(oh->ot_sa_list.next,
139                                  struct osd_object, oo_sa_linkage);
140                 sa_spill_rele(obj->oo_sa_hdl);
141                 write_lock(&obj->oo_attr_lock);
142                 list_del_init(&obj->oo_sa_linkage);
143                 write_unlock(&obj->oo_attr_lock);
144         }
145         up(&oh->ot_sa_lock);
146 }
147
148 /*
149  * Update the SA and add the object to the dirty list.
150  */
151 int osd_object_sa_update(struct osd_object *obj, sa_attr_type_t type,
152                          void *buf, uint32_t buflen, struct osd_thandle *oh)
153 {
154         int rc;
155
156         LASSERT(obj->oo_sa_hdl != NULL);
157         LASSERT(oh->ot_tx != NULL);
158
159         rc = -sa_update(obj->oo_sa_hdl, type, buf, buflen, oh->ot_tx);
160         osd_object_sa_dirty_add(obj, oh);
161
162         return rc;
163 }
164
165 /*
166  * Bulk update the SA and add the object to the dirty list.
167  */
168 static int
169 osd_object_sa_bulk_update(struct osd_object *obj, sa_bulk_attr_t *attrs,
170                           int count, struct osd_thandle *oh)
171 {
172         int rc;
173
174         LASSERT(obj->oo_sa_hdl != NULL);
175         LASSERT(oh->ot_tx != NULL);
176
177         rc = -sa_bulk_update(obj->oo_sa_hdl, attrs, count, oh->ot_tx);
178         osd_object_sa_dirty_add(obj, oh);
179
180         return rc;
181 }
182
183 /*
184  * Retrieve the attributes of a DMU object
185  */
186 int __osd_object_attr_get(const struct lu_env *env, struct osd_device *o,
187                           struct osd_object *obj, struct lu_attr *la)
188 {
189         struct osa_attr *osa = &osd_oti_get(env)->oti_osa;
190         sa_handle_t     *sa_hdl;
191         sa_bulk_attr_t  *bulk;
192         int              cnt = 0;
193         int              rc;
194         ENTRY;
195
196         LASSERT(obj->oo_db != NULL);
197
198         rc = -sa_handle_get(o->od_os, obj->oo_db->db_object, NULL,
199                             SA_HDL_PRIVATE, &sa_hdl);
200         if (rc)
201                 RETURN(rc);
202
203         OBD_ALLOC(bulk, sizeof(sa_bulk_attr_t) * 9);
204         if (bulk == NULL)
205                 GOTO(out_sa, rc = -ENOMEM);
206
207         la->la_valid |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE | LA_TYPE |
208                         LA_SIZE | LA_UID | LA_GID | LA_FLAGS | LA_NLINK;
209
210         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(o), NULL, osa->atime, 16);
211         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(o), NULL, osa->mtime, 16);
212         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(o), NULL, osa->ctime, 16);
213         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(o), NULL, &osa->mode, 8);
214         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(o), NULL, &osa->size, 8);
215         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(o), NULL, &osa->nlink, 8);
216         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(o), NULL, &osa->uid, 8);
217         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(o), NULL, &osa->gid, 8);
218         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(o), NULL, &osa->flags, 8);
219
220         rc = -sa_bulk_lookup(sa_hdl, bulk, cnt);
221         if (rc)
222                 GOTO(out_bulk, rc);
223
224         la->la_atime = osa->atime[0];
225         la->la_mtime = osa->mtime[0];
226         la->la_ctime = osa->ctime[0];
227         la->la_mode = osa->mode;
228         la->la_uid = osa->uid;
229         la->la_gid = osa->gid;
230         la->la_nlink = osa->nlink;
231         la->la_flags = attrs_zfs2fs(osa->flags);
232         la->la_size = osa->size;
233
234         /* Try to get extra flag from LMA. Right now, only LMAI_ORPHAN
235          * flags is stored in LMA, and it is only for orphan directory */
236         if (S_ISDIR(la->la_mode) && dt_object_exists(&obj->oo_dt)) {
237                 struct osd_thread_info *info = osd_oti_get(env);
238                 struct lustre_mdt_attrs *lma;
239                 struct lu_buf buf;
240
241                 lma = (struct lustre_mdt_attrs *)info->oti_buf;
242                 buf.lb_buf = lma;
243                 buf.lb_len = sizeof(info->oti_buf);
244                 rc = osd_xattr_get(env, &obj->oo_dt, &buf, XATTR_NAME_LMA);
245                 if (rc > 0) {
246                         rc = 0;
247                         lma->lma_incompat = le32_to_cpu(lma->lma_incompat);
248                         obj->oo_lma_flags =
249                                 lma_to_lustre_flags(lma->lma_incompat);
250
251                 } else if (rc == -ENODATA) {
252                         rc = 0;
253                 }
254         }
255
256         if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode)) {
257                 rc = -sa_lookup(sa_hdl, SA_ZPL_RDEV(o), &osa->rdev, 8);
258                 if (rc)
259                         GOTO(out_bulk, rc);
260                 la->la_rdev = osa->rdev;
261                 la->la_valid |= LA_RDEV;
262         }
263 out_bulk:
264         OBD_FREE(bulk, sizeof(sa_bulk_attr_t) * 9);
265 out_sa:
266         sa_handle_destroy(sa_hdl);
267
268         RETURN(rc);
269 }
270
271 int __osd_obj2dbuf(const struct lu_env *env, objset_t *os,
272                    uint64_t oid, dmu_buf_t **dbp)
273 {
274         dmu_object_info_t *doi = &osd_oti_get(env)->oti_doi;
275         int rc;
276
277         rc = -sa_buf_hold(os, oid, osd_obj_tag, dbp);
278         if (rc)
279                 return rc;
280
281         dmu_object_info_from_db(*dbp, doi);
282         if (unlikely (oid != DMU_USERUSED_OBJECT &&
283             oid != DMU_GROUPUSED_OBJECT && doi->doi_bonus_type != DMU_OT_SA)) {
284                 sa_buf_rele(*dbp, osd_obj_tag);
285                 *dbp = NULL;
286                 return -EINVAL;
287         }
288
289         LASSERT(*dbp);
290         LASSERT((*dbp)->db_object == oid);
291         LASSERT((*dbp)->db_offset == -1);
292         LASSERT((*dbp)->db_data != NULL);
293
294         return 0;
295 }
296
297 /*
298  * Concurrency: no concurrent access is possible that early in object
299  * life-cycle.
300  */
301 struct lu_object *osd_object_alloc(const struct lu_env *env,
302                                    const struct lu_object_header *hdr,
303                                    struct lu_device *d)
304 {
305         struct osd_object *mo;
306
307         OBD_SLAB_ALLOC_PTR_GFP(mo, osd_object_kmem, GFP_NOFS);
308         if (mo != NULL) {
309                 struct lu_object *l;
310
311                 l = &mo->oo_dt.do_lu;
312                 dt_object_init(&mo->oo_dt, NULL, d);
313                 mo->oo_dt.do_ops = &osd_obj_ops;
314                 l->lo_ops = &osd_lu_obj_ops;
315                 INIT_LIST_HEAD(&mo->oo_sa_linkage);
316                 INIT_LIST_HEAD(&mo->oo_unlinked_linkage);
317                 init_rwsem(&mo->oo_sem);
318                 init_rwsem(&mo->oo_guard);
319                 rwlock_init(&mo->oo_attr_lock);
320                 mo->oo_destroy = OSD_DESTROY_NONE;
321                 return l;
322         } else {
323                 return NULL;
324         }
325 }
326
327 /*
328  * Concurrency: shouldn't matter.
329  */
330 int osd_object_init0(const struct lu_env *env, struct osd_object *obj)
331 {
332         struct osd_device       *osd = osd_obj2dev(obj);
333         const struct lu_fid     *fid = lu_object_fid(&obj->oo_dt.do_lu);
334         int                      rc = 0;
335         ENTRY;
336
337         if (obj->oo_db == NULL)
338                 RETURN(0);
339
340         /* object exist */
341
342         rc = osd_object_sa_init(obj, osd);
343         if (rc)
344                 RETURN(rc);
345
346         /* cache attrs in object */
347         rc = __osd_object_attr_get(env, osd, obj, &obj->oo_attr);
348         if (rc)
349                 RETURN(rc);
350
351         if (likely(!fid_is_acct(fid)))
352                 /* no body operations for accounting objects */
353                 obj->oo_dt.do_body_ops = &osd_body_ops;
354
355         /*
356          * initialize object before marking it existing
357          */
358         obj->oo_dt.do_lu.lo_header->loh_attr |= obj->oo_attr.la_mode & S_IFMT;
359
360         smp_mb();
361         obj->oo_dt.do_lu.lo_header->loh_attr |= LOHA_EXISTS;
362
363         RETURN(0);
364 }
365
366 static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
367 {
368         struct osd_thread_info  *info = osd_oti_get(env);
369         struct lu_buf           buf;
370         int                     rc;
371         struct lustre_mdt_attrs *lma;
372         ENTRY;
373
374         CLASSERT(sizeof(info->oti_buf) >= sizeof(*lma));
375         lma = (struct lustre_mdt_attrs *)info->oti_buf;
376         buf.lb_buf = lma;
377         buf.lb_len = sizeof(info->oti_buf);
378
379         rc = osd_xattr_get(env, &obj->oo_dt, &buf, XATTR_NAME_LMA);
380         if (rc > 0) {
381                 rc = 0;
382                 lustre_lma_swab(lma);
383                 if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) ||
384                              CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) {
385                         CWARN("%s: unsupported incompat LMA feature(s) %#x for "
386                               "fid = "DFID"\n", osd_obj2dev(obj)->od_svname,
387                               lma->lma_incompat & ~LMA_INCOMPAT_SUPP,
388                               PFID(lu_object_fid(&obj->oo_dt.do_lu)));
389                         rc = -EOPNOTSUPP;
390                 }
391         } else if (rc == -ENODATA) {
392                 /* haven't initialize LMA xattr */
393                 rc = 0;
394         }
395
396         RETURN(rc);
397 }
398
399 /*
400  * Concurrency: no concurrent access is possible that early in object
401  * life-cycle.
402  */
403 static int osd_object_init(const struct lu_env *env, struct lu_object *l,
404                            const struct lu_object_conf *conf)
405 {
406         struct osd_object       *obj = osd_obj(l);
407         struct osd_device       *osd = osd_obj2dev(obj);
408         uint64_t                 oid;
409         int                      rc;
410         ENTRY;
411
412         LASSERT(osd_invariant(obj));
413
414         if (fid_is_otable_it(&l->lo_header->loh_fid)) {
415                 obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
416                 l->lo_header->loh_attr |= LOHA_EXISTS;
417                 RETURN(0);
418         }
419
420         if (conf != NULL && conf->loc_flags & LOC_F_NEW)
421                 GOTO(out, rc = 0);
422
423         rc = osd_fid_lookup(env, osd, lu_object_fid(l), &oid);
424         if (rc == 0) {
425                 LASSERT(obj->oo_db == NULL);
426                 rc = __osd_obj2dbuf(env, osd->od_os, oid, &obj->oo_db);
427                 /* EEXIST will be returned if object is being deleted in ZFS */
428                 if (rc == -EEXIST) {
429                         rc = 0;
430                         GOTO(out, rc);
431                 }
432                 if (rc != 0) {
433                         CERROR("%s: lookup "DFID"/"LPX64" failed: rc = %d\n",
434                                osd->od_svname, PFID(lu_object_fid(l)), oid, rc);
435                         GOTO(out, rc);
436                 }
437                 LASSERT(obj->oo_db);
438                 rc = osd_object_init0(env, obj);
439                 if (rc != 0)
440                         GOTO(out, rc);
441
442                 rc = osd_check_lma(env, obj);
443                 if (rc != 0)
444                         GOTO(out, rc);
445         } else if (rc == -ENOENT) {
446                 rc = 0;
447         }
448         LASSERT(osd_invariant(obj));
449 out:
450         RETURN(rc);
451 }
452
453 /*
454  * Concurrency: no concurrent access is possible that late in object
455  * life-cycle.
456  */
457 static void osd_object_free(const struct lu_env *env, struct lu_object *l)
458 {
459         struct osd_object *obj = osd_obj(l);
460
461         LASSERT(osd_invariant(obj));
462
463         dt_object_fini(&obj->oo_dt);
464         OBD_SLAB_FREE_PTR(obj, osd_object_kmem);
465 }
466
467 static int
468 osd_object_unlinked_add(struct osd_object *obj, struct osd_thandle *oh)
469 {
470         int rc = -EBUSY;
471
472         LASSERT(obj->oo_destroy == OSD_DESTROY_ASYNC);
473
474         /* the object is supposed to be exclusively locked by
475          * the caller (osd_object_destroy()), while the transaction
476          * (oh) is per-thread and not shared */
477         if (likely(list_empty(&obj->oo_unlinked_linkage))) {
478                 list_add(&obj->oo_unlinked_linkage, &oh->ot_unlinked_list);
479                 rc = 0;
480         }
481
482         return rc;
483 }
484
485 /* Default to max data size covered by a level-1 indirect block */
486 static unsigned long osd_sync_destroy_max_size =
487         1UL << (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT + SPA_MAXBLOCKSHIFT);
488 module_param(osd_sync_destroy_max_size, ulong, 0444);
489 MODULE_PARM_DESC(osd_sync_destroy_max_size, "Maximum object size to use synchronous destroy.");
490
491 static inline void
492 osd_object_set_destroy_type(struct osd_object *obj)
493 {
494         /*
495          * Lock-less OST_WRITE can race with OST_DESTROY, so set destroy type
496          * only once and use it consistently thereafter.
497          */
498         down_write(&obj->oo_guard);
499         if (obj->oo_destroy == OSD_DESTROY_NONE) {
500                 if (obj->oo_attr.la_size <= osd_sync_destroy_max_size)
501                         obj->oo_destroy = OSD_DESTROY_SYNC;
502                 else /* Larger objects are destroyed asynchronously */
503                         obj->oo_destroy = OSD_DESTROY_ASYNC;
504         }
505         up_write(&obj->oo_guard);
506 }
507
508 static int osd_declare_object_destroy(const struct lu_env *env,
509                                       struct dt_object *dt,
510                                       struct thandle *th)
511 {
512         char                    *buf = osd_oti_get(env)->oti_str;
513         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
514         struct osd_object       *obj = osd_dt_obj(dt);
515         struct osd_device       *osd = osd_obj2dev(obj);
516         struct osd_thandle      *oh;
517         int                      rc;
518         uint64_t                 zapid;
519         ENTRY;
520
521         LASSERT(th != NULL);
522         LASSERT(dt_object_exists(dt));
523
524         oh = container_of0(th, struct osd_thandle, ot_super);
525         LASSERT(oh->ot_tx != NULL);
526
527         /* declare that we'll remove object from fid-dnode mapping */
528         zapid = osd_get_name_n_idx(env, osd, fid, buf);
529         dmu_tx_hold_bonus(oh->ot_tx, zapid);
530         dmu_tx_hold_zap(oh->ot_tx, zapid, FALSE, buf);
531
532         osd_declare_xattrs_destroy(env, obj, oh);
533
534         /* declare that we'll remove object from inode accounting ZAPs */
535         dmu_tx_hold_bonus(oh->ot_tx, osd->od_iusr_oid);
536         dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, FALSE, buf);
537         dmu_tx_hold_bonus(oh->ot_tx, osd->od_igrp_oid);
538         dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, FALSE, buf);
539
540         /* one less inode */
541         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
542                                obj->oo_attr.la_gid, -1, oh, false, NULL, false);
543         if (rc)
544                 RETURN(rc);
545
546         /* data to be truncated */
547         rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
548                                obj->oo_attr.la_gid, 0, oh, true, NULL, false);
549         if (rc)
550                 RETURN(rc);
551
552         osd_object_set_destroy_type(obj);
553         if (obj->oo_destroy == OSD_DESTROY_SYNC)
554                 dmu_tx_hold_free(oh->ot_tx, obj->oo_db->db_object,
555                                  0, DMU_OBJECT_END);
556         else
557                 dmu_tx_hold_zap(oh->ot_tx, osd->od_unlinkedid, TRUE, NULL);
558
559         RETURN(0);
560 }
561
562 static int osd_object_destroy(const struct lu_env *env,
563                               struct dt_object *dt, struct thandle *th)
564 {
565         char                    *buf = osd_oti_get(env)->oti_str;
566         struct osd_object       *obj = osd_dt_obj(dt);
567         struct osd_device       *osd = osd_obj2dev(obj);
568         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
569         struct osd_thandle      *oh;
570         int                      rc;
571         uint64_t                 oid, zapid;
572         ENTRY;
573
574         down_write(&obj->oo_guard);
575
576         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
577                 GOTO(out, rc = -ENOENT);
578
579         LASSERT(obj->oo_db != NULL);
580
581         oh = container_of0(th, struct osd_thandle, ot_super);
582         LASSERT(oh != NULL);
583         LASSERT(oh->ot_tx != NULL);
584
585         /* remove obj ref from index dir (it depends) */
586         zapid = osd_get_name_n_idx(env, osd, fid, buf);
587         rc = -zap_remove(osd->od_os, zapid, buf, oh->ot_tx);
588         if (rc) {
589                 CERROR("%s: zap_remove(%s) failed: rc = %d\n",
590                        osd->od_svname, buf, rc);
591                 GOTO(out, rc);
592         }
593
594         rc = osd_xattrs_destroy(env, obj, oh);
595         if (rc) {
596                 CERROR("%s: cannot destroy xattrs for %s: rc = %d\n",
597                        osd->od_svname, buf, rc);
598                 GOTO(out, rc);
599         }
600
601         /* Remove object from inode accounting. It is not fatal for the destroy
602          * operation if something goes wrong while updating accounting, but we
603          * still log an error message to notify the administrator */
604         rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid,
605                                 obj->oo_attr.la_uid, -1, oh->ot_tx);
606         if (rc)
607                 CERROR("%s: failed to remove "DFID" from accounting ZAP for usr"
608                        " %d: rc = %d\n", osd->od_svname, PFID(fid),
609                        obj->oo_attr.la_uid, rc);
610         rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid,
611                                 obj->oo_attr.la_gid, -1, oh->ot_tx);
612         if (rc)
613                 CERROR("%s: failed to remove "DFID" from accounting ZAP for grp"
614                        " %d: rc = %d\n", osd->od_svname, PFID(fid),
615                        obj->oo_attr.la_gid, rc);
616
617         oid = obj->oo_db->db_object;
618         if (unlikely(obj->oo_destroy == OSD_DESTROY_NONE)) {
619                 /* this may happen if the destroy wasn't declared
620                  * e.g. when the object is created and then destroyed
621                  * in the same transaction - we don't need additional
622                  * space for destroy specifically */
623                 LASSERT(obj->oo_attr.la_size <= osd_sync_destroy_max_size);
624                 rc = -dmu_object_free(osd->od_os, oid, oh->ot_tx);
625                 if (rc)
626                         CERROR("%s: failed to free %s "LPU64": rc = %d\n",
627                                osd->od_svname, buf, oid, rc);
628         } else if (obj->oo_destroy == OSD_DESTROY_SYNC) {
629                 rc = -dmu_object_free(osd->od_os, oid, oh->ot_tx);
630                 if (rc)
631                         CERROR("%s: failed to free %s "LPU64": rc = %d\n",
632                                osd->od_svname, buf, oid, rc);
633         } else { /* asynchronous destroy */
634                 rc = osd_object_unlinked_add(obj, oh);
635                 if (rc)
636                         GOTO(out, rc);
637
638                 rc = -zap_add_int(osd->od_os, osd->od_unlinkedid,
639                                   oid, oh->ot_tx);
640                 if (rc)
641                         CERROR("%s: zap_add_int() failed %s "LPU64": rc = %d\n",
642                                osd->od_svname, buf, oid, rc);
643         }
644
645 out:
646         /* not needed in the cache anymore */
647         set_bit(LU_OBJECT_HEARD_BANSHEE, &dt->do_lu.lo_header->loh_flags);
648         if (rc == 0)
649                 obj->oo_destroyed = 1;
650         up_write(&obj->oo_guard);
651         RETURN (0);
652 }
653
654 static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
655 {
656         struct osd_object *obj = osd_obj(l);
657
658         if (obj->oo_db != NULL) {
659                 osd_object_sa_fini(obj);
660                 if (obj->oo_sa_xattr) {
661                         nvlist_free(obj->oo_sa_xattr);
662                         obj->oo_sa_xattr = NULL;
663                 }
664                 sa_buf_rele(obj->oo_db, osd_obj_tag);
665                 list_del(&obj->oo_sa_linkage);
666                 obj->oo_db = NULL;
667         }
668 }
669
/*
 * Release hook of the lu_object; this implementation has nothing to do.
 *
 * Concurrency: ->loo_object_release() is called under site spin-lock.
 */
static void osd_object_release(const struct lu_env *env, struct lu_object *l)
{
	/* intentionally empty */
}
677
678 /*
679  * Concurrency: shouldn't matter.
680  */
681 static int osd_object_print(const struct lu_env *env, void *cookie,
682                             lu_printer_t p, const struct lu_object *l)
683 {
684         struct osd_object *o = osd_obj(l);
685
686         return (*p)(env, cookie, LUSTRE_OSD_ZFS_NAME"-object@%p", o);
687 }
688
689 static void osd_object_read_lock(const struct lu_env *env,
690                                  struct dt_object *dt, unsigned role)
691 {
692         struct osd_object *obj = osd_dt_obj(dt);
693
694         LASSERT(osd_invariant(obj));
695
696         down_read_nested(&obj->oo_sem, role);
697 }
698
699 static void osd_object_write_lock(const struct lu_env *env,
700                                   struct dt_object *dt, unsigned role)
701 {
702         struct osd_object *obj = osd_dt_obj(dt);
703
704         LASSERT(osd_invariant(obj));
705
706         down_write_nested(&obj->oo_sem, role);
707 }
708
709 static void osd_object_read_unlock(const struct lu_env *env,
710                                    struct dt_object *dt)
711 {
712         struct osd_object *obj = osd_dt_obj(dt);
713
714         LASSERT(osd_invariant(obj));
715         up_read(&obj->oo_sem);
716 }
717
718 static void osd_object_write_unlock(const struct lu_env *env,
719                                     struct dt_object *dt)
720 {
721         struct osd_object *obj = osd_dt_obj(dt);
722
723         LASSERT(osd_invariant(obj));
724         up_write(&obj->oo_sem);
725 }
726
727 static int osd_object_write_locked(const struct lu_env *env,
728                                    struct dt_object *dt)
729 {
730         struct osd_object *obj = osd_dt_obj(dt);
731         int rc = 1;
732
733         LASSERT(osd_invariant(obj));
734
735         if (down_write_trylock(&obj->oo_sem)) {
736                 rc = 0;
737                 up_write(&obj->oo_sem);
738         }
739         return rc;
740 }
741
742 static int osd_attr_get(const struct lu_env *env,
743                         struct dt_object *dt,
744                         struct lu_attr *attr)
745 {
746         struct osd_object       *obj = osd_dt_obj(dt);
747         uint64_t                 blocks;
748         uint32_t                 blksize;
749         int                      rc = 0;
750
751         down_read(&obj->oo_guard);
752
753         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
754                 GOTO(out, rc = -ENOENT);
755
756         LASSERT(osd_invariant(obj));
757         LASSERT(obj->oo_db);
758
759         read_lock(&obj->oo_attr_lock);
760         *attr = obj->oo_attr;
761         if (obj->oo_lma_flags & LUSTRE_ORPHAN_FL)
762                 attr->la_flags |= LUSTRE_ORPHAN_FL;
763         read_unlock(&obj->oo_attr_lock);
764
765         /* with ZFS_DEBUG zrl_add_debug() called by DB_DNODE_ENTER()
766          * from within sa_object_size() can block on a mutex, so
767          * we can't call sa_object_size() holding rwlock */
768         sa_object_size(obj->oo_sa_hdl, &blksize, &blocks);
769         /* we do not control size of indices, so always calculate
770          * it from number of blocks reported by DMU */
771         if (S_ISDIR(attr->la_mode))
772                 attr->la_size = 512 * blocks;
773         /* Block size may be not set; suggest maximal I/O transfers. */
774         if (blksize == 0)
775                 blksize = osd_spa_maxblocksize(
776                         dmu_objset_spa(osd_obj2dev(obj)->od_os));
777
778         attr->la_blksize = blksize;
779         attr->la_blocks = blocks;
780         attr->la_valid |= LA_BLOCKS | LA_BLKSIZE;
781
782 out:
783         up_read(&obj->oo_guard);
784         return rc;
785 }
786
787 /* Simple wrapper on top of qsd API which implement quota transfer for osd
788  * setattr needs. As a reminder, only the root user can change ownership of
789  * a file, that's why EDQUOT & EINPROGRESS errors are discarded */
790 static inline int qsd_transfer(const struct lu_env *env,
791                                struct qsd_instance *qsd,
792                                struct lquota_trans *trans, int qtype,
793                                __u64 orig_id, __u64 new_id, __u64 bspace,
794                                struct lquota_id_info *qi)
795 {
796         int     rc;
797
798         if (unlikely(qsd == NULL))
799                 return 0;
800
801         LASSERT(qtype >= 0 && qtype < LL_MAXQUOTAS);
802         qi->lqi_type = qtype;
803
804         /* inode accounting */
805         qi->lqi_is_blk = false;
806
807         /* one more inode for the new owner ... */
808         qi->lqi_id.qid_uid = new_id;
809         qi->lqi_space      = 1;
810         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
811         if (rc == -EDQUOT || rc == -EINPROGRESS)
812                 rc = 0;
813         if (rc)
814                 return rc;
815
816         /* and one less inode for the current id */
817         qi->lqi_id.qid_uid = orig_id;;
818         qi->lqi_space      = -1;
819         /* can't get EDQUOT when reducing usage */
820         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
821         if (rc == -EINPROGRESS)
822                 rc = 0;
823         if (rc)
824                 return rc;
825
826         /* block accounting */
827         qi->lqi_is_blk = true;
828
829         /* more blocks for the new owner ... */
830         qi->lqi_id.qid_uid = new_id;
831         qi->lqi_space      = bspace;
832         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
833         if (rc == -EDQUOT || rc == -EINPROGRESS)
834                 rc = 0;
835         if (rc)
836                 return rc;
837
838         /* and finally less blocks for the current owner */
839         qi->lqi_id.qid_uid = orig_id;
840         qi->lqi_space      = -bspace;
841         rc = qsd_op_begin(env, qsd, trans, qi, NULL);
842         /* can't get EDQUOT when reducing usage */
843         if (rc == -EINPROGRESS)
844                 rc = 0;
845         return rc;
846 }
847
848 static int osd_declare_attr_set(const struct lu_env *env,
849                                 struct dt_object *dt,
850                                 const struct lu_attr *attr,
851                                 struct thandle *handle)
852 {
853         struct osd_thread_info  *info = osd_oti_get(env);
854         char                    *buf = osd_oti_get(env)->oti_str;
855         struct osd_object       *obj = osd_dt_obj(dt);
856         struct osd_device       *osd = osd_obj2dev(obj);
857         struct osd_thandle      *oh;
858         uint64_t                 bspace;
859         uint32_t                 blksize;
860         int                      rc = 0;
861         ENTRY;
862
863
864         LASSERT(handle != NULL);
865         LASSERT(osd_invariant(obj));
866
867         oh = container_of0(handle, struct osd_thandle, ot_super);
868
869         down_read(&obj->oo_guard);
870         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
871                 GOTO(out, rc = 0);
872
873         LASSERT(obj->oo_sa_hdl != NULL);
874         LASSERT(oh->ot_tx != NULL);
875         dmu_tx_hold_sa(oh->ot_tx, obj->oo_sa_hdl, 0);
876         if (oh->ot_tx->tx_err != 0)
877                 GOTO(out, rc = -oh->ot_tx->tx_err);
878
879         sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
880         bspace = toqb(bspace * blksize);
881
882         __osd_xattr_declare_set(env, obj, sizeof(struct lustre_mdt_attrs),
883                                 XATTR_NAME_LMA, oh);
884
885         if (attr && attr->la_valid & LA_UID) {
886                 /* account for user inode tracking ZAP update */
887                 dmu_tx_hold_bonus(oh->ot_tx, osd->od_iusr_oid);
888                 dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, TRUE, buf);
889
890                 /* quota enforcement for user */
891                 if (attr->la_uid != obj->oo_attr.la_uid) {
892                         rc = qsd_transfer(env, osd->od_quota_slave,
893                                           &oh->ot_quota_trans, USRQUOTA,
894                                           obj->oo_attr.la_uid, attr->la_uid,
895                                           bspace, &info->oti_qi);
896                         if (rc)
897                                 GOTO(out, rc);
898                 }
899         }
900         if (attr && attr->la_valid & LA_GID) {
901                 /* account for user inode tracking ZAP update */
902                 dmu_tx_hold_bonus(oh->ot_tx, osd->od_igrp_oid);
903                 dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, TRUE, buf);
904
905                 /* quota enforcement for group */
906                 if (attr->la_gid != obj->oo_attr.la_gid) {
907                         rc = qsd_transfer(env, osd->od_quota_slave,
908                                           &oh->ot_quota_trans, GRPQUOTA,
909                                           obj->oo_attr.la_gid, attr->la_gid,
910                                           bspace, &info->oti_qi);
911                         if (rc)
912                                 GOTO(out, rc);
913                 }
914         }
915
916 out:
917         up_read(&obj->oo_guard);
918         RETURN(rc);
919 }
920
921 /*
922  * Set the attributes of an object
923  *
924  * The transaction passed to this routine must have
925  * dmu_tx_hold_bonus(tx, oid) called and then assigned
926  * to a transaction group.
927  */
928 static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
929                         const struct lu_attr *la, struct thandle *handle)
930 {
931         struct osd_thread_info  *info = osd_oti_get(env);
932         struct osd_object       *obj = osd_dt_obj(dt);
933         struct osd_device       *osd = osd_obj2dev(obj);
934         struct osd_thandle      *oh;
935         struct osa_attr         *osa = &info->oti_osa;
936         sa_bulk_attr_t          *bulk;
937         __u64                    valid = la->la_valid;
938         int                      cnt;
939         int                      rc = 0;
940
941         ENTRY;
942
943         down_read(&obj->oo_guard);
944         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
945                 GOTO(out, rc = -ENOENT);
946
947         LASSERT(handle != NULL);
948         LASSERT(osd_invariant(obj));
949         LASSERT(obj->oo_sa_hdl);
950
951         oh = container_of0(handle, struct osd_thandle, ot_super);
952         /* Assert that the transaction has been assigned to a
953            transaction group. */
954         LASSERT(oh->ot_tx->tx_txg != 0);
955
956         /* Only allow set size for regular file */
957         if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
958                 valid &= ~(LA_SIZE | LA_BLOCKS);
959
960         if (valid & LA_CTIME && la->la_ctime == obj->oo_attr.la_ctime)
961                 valid &= ~LA_CTIME;
962
963         if (valid & LA_MTIME && la->la_mtime == obj->oo_attr.la_mtime)
964                 valid &= ~LA_MTIME;
965
966         if (valid & LA_ATIME && la->la_atime == obj->oo_attr.la_atime)
967                 valid &= ~LA_ATIME;
968
969         if (valid == 0)
970                 GOTO(out, rc = 0);
971
972         if (valid & LA_FLAGS) {
973                 struct lustre_mdt_attrs *lma;
974                 struct lu_buf buf;
975
976                 if (la->la_flags & LUSTRE_LMA_FL_MASKS) {
977                         CLASSERT(sizeof(info->oti_buf) >= sizeof(*lma));
978                         lma = (struct lustre_mdt_attrs *)&info->oti_buf;
979                         buf.lb_buf = lma;
980                         buf.lb_len = sizeof(info->oti_buf);
981                         rc = osd_xattr_get(env, &obj->oo_dt, &buf,
982                                            XATTR_NAME_LMA);
983                         if (rc > 0) {
984                                 lma->lma_incompat =
985                                         le32_to_cpu(lma->lma_incompat);
986                                 lma->lma_incompat |=
987                                         lustre_to_lma_flags(la->la_flags);
988                                 lma->lma_incompat =
989                                         cpu_to_le32(lma->lma_incompat);
990                                 buf.lb_buf = lma;
991                                 buf.lb_len = sizeof(*lma);
992                                 rc = osd_xattr_set_internal(env, obj, &buf,
993                                                             XATTR_NAME_LMA,
994                                                             LU_XATTR_REPLACE,
995                                                             oh);
996                         }
997                         if (rc < 0) {
998                                 CWARN("%s: failed to set LMA flags: rc = %d\n",
999                                        osd->od_svname, rc);
1000                                 RETURN(rc);
1001                         }
1002                 }
1003         }
1004
1005         OBD_ALLOC(bulk, sizeof(sa_bulk_attr_t) * 10);
1006         if (bulk == NULL)
1007                 GOTO(out, rc = -ENOMEM);
1008
1009         /* do both accounting updates outside oo_attr_lock below */
1010         if ((valid & LA_UID) && (la->la_uid != obj->oo_attr.la_uid)) {
1011                 /* Update user accounting. Failure isn't fatal, but we still
1012                  * log an error message */
1013                 rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid,
1014                                         la->la_uid, 1, oh->ot_tx);
1015                 if (rc)
1016                         CERROR("%s: failed to update accounting ZAP for user "
1017                                 "%d (%d)\n", osd->od_svname, la->la_uid, rc);
1018                 rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid,
1019                                         obj->oo_attr.la_uid, -1, oh->ot_tx);
1020                 if (rc)
1021                         CERROR("%s: failed to update accounting ZAP for user "
1022                                 "%d (%d)\n", osd->od_svname,
1023                                 obj->oo_attr.la_uid, rc);
1024         }
1025         if ((valid & LA_GID) && (la->la_gid != obj->oo_attr.la_gid)) {
1026                 /* Update group accounting. Failure isn't fatal, but we still
1027                  * log an error message */
1028                 rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid,
1029                                         la->la_gid, 1, oh->ot_tx);
1030                 if (rc)
1031                         CERROR("%s: failed to update accounting ZAP for user "
1032                                 "%d (%d)\n", osd->od_svname, la->la_gid, rc);
1033                 rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid,
1034                                         obj->oo_attr.la_gid, -1, oh->ot_tx);
1035                 if (rc)
1036                         CERROR("%s: failed to update accounting ZAP for user "
1037                                 "%d (%d)\n", osd->od_svname,
1038                                 obj->oo_attr.la_gid, rc);
1039         }
1040
1041         write_lock(&obj->oo_attr_lock);
1042         cnt = 0;
1043         if (valid & LA_ATIME) {
1044                 osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
1045                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL,
1046                                  osa->atime, 16);
1047         }
1048         if (valid & LA_MTIME) {
1049                 osa->mtime[0] = obj->oo_attr.la_mtime = la->la_mtime;
1050                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL,
1051                                  osa->mtime, 16);
1052         }
1053         if (valid & LA_CTIME) {
1054                 osa->ctime[0] = obj->oo_attr.la_ctime = la->la_ctime;
1055                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL,
1056                                  osa->ctime, 16);
1057         }
1058         if (valid & LA_MODE) {
1059                 /* mode is stored along with type, so read it first */
1060                 obj->oo_attr.la_mode = (obj->oo_attr.la_mode & S_IFMT) |
1061                         (la->la_mode & ~S_IFMT);
1062                 osa->mode = obj->oo_attr.la_mode;
1063                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL,
1064                                  &osa->mode, 8);
1065         }
1066         if (valid & LA_SIZE) {
1067                 osa->size = obj->oo_attr.la_size = la->la_size;
1068                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL,
1069                                  &osa->size, 8);
1070         }
1071         if (valid & LA_NLINK) {
1072                 osa->nlink = obj->oo_attr.la_nlink = la->la_nlink;
1073                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL,
1074                                  &osa->nlink, 8);
1075         }
1076         if (valid & LA_RDEV) {
1077                 osa->rdev = obj->oo_attr.la_rdev = la->la_rdev;
1078                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL,
1079                                  &osa->rdev, 8);
1080         }
1081         if (valid & LA_FLAGS) {
1082                 osa->flags = attrs_fs2zfs(la->la_flags);
1083                 /* many flags are not supported by zfs, so ensure a good cached
1084                  * copy */
1085                 obj->oo_attr.la_flags = attrs_zfs2fs(osa->flags);
1086                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL,
1087                                  &osa->flags, 8);
1088         }
1089         if (valid & LA_UID) {
1090                 osa->uid = obj->oo_attr.la_uid = la->la_uid;
1091                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL,
1092                                  &osa->uid, 8);
1093         }
1094         if (valid & LA_GID) {
1095                 osa->gid = obj->oo_attr.la_gid = la->la_gid;
1096                 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL,
1097                                  &osa->gid, 8);
1098         }
1099         obj->oo_attr.la_valid |= valid;
1100         write_unlock(&obj->oo_attr_lock);
1101
1102         rc = osd_object_sa_bulk_update(obj, bulk, cnt, oh);
1103
1104         OBD_FREE(bulk, sizeof(sa_bulk_attr_t) * 10);
1105 out:
1106         up_read(&obj->oo_guard);
1107         RETURN(rc);
1108 }
1109
1110 /*
1111  * Object creation.
1112  *
1113  * XXX temporary solution.
1114  */
1115
1116 static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
1117                         struct dt_object *parent, struct dt_object *child,
1118                         umode_t child_mode)
1119 {
1120         LASSERT(ah);
1121
1122         ah->dah_parent = parent;
1123         ah->dah_mode = child_mode;
1124 }
1125
1126 static int osd_declare_object_create(const struct lu_env *env,
1127                                      struct dt_object *dt,
1128                                      struct lu_attr *attr,
1129                                      struct dt_allocation_hint *hint,
1130                                      struct dt_object_format *dof,
1131                                      struct thandle *handle)
1132 {
1133         char                    *buf = osd_oti_get(env)->oti_str;
1134         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
1135         struct osd_object       *obj = osd_dt_obj(dt);
1136         struct osd_device       *osd = osd_obj2dev(obj);
1137         struct osd_thandle      *oh;
1138         uint64_t                 zapid;
1139         int                      rc;
1140         ENTRY;
1141
1142         LASSERT(dof);
1143
1144         switch (dof->dof_type) {
1145                 case DFT_REGULAR:
1146                 case DFT_SYM:
1147                 case DFT_NODE:
1148                         if (obj->oo_dt.do_body_ops == NULL)
1149                                 obj->oo_dt.do_body_ops = &osd_body_ops;
1150                         break;
1151                 default:
1152                         break;
1153         }
1154
1155         LASSERT(handle != NULL);
1156         oh = container_of0(handle, struct osd_thandle, ot_super);
1157         LASSERT(oh->ot_tx != NULL);
1158
1159         switch (dof->dof_type) {
1160                 case DFT_DIR:
1161                         dt->do_index_ops = &osd_dir_ops;
1162                 case DFT_INDEX:
1163                         /* for zap create */
1164                         dmu_tx_hold_zap(oh->ot_tx, DMU_NEW_OBJECT, 1, NULL);
1165                         break;
1166                 case DFT_REGULAR:
1167                 case DFT_SYM:
1168                 case DFT_NODE:
1169                         /* first, we'll create new object */
1170                         dmu_tx_hold_bonus(oh->ot_tx, DMU_NEW_OBJECT);
1171                         break;
1172
1173                 default:
1174                         LBUG();
1175                         break;
1176         }
1177
1178         /* and we'll add it to some mapping */
1179         zapid = osd_get_name_n_idx(env, osd, fid, buf);
1180         dmu_tx_hold_bonus(oh->ot_tx, zapid);
1181         dmu_tx_hold_zap(oh->ot_tx, zapid, TRUE, buf);
1182
1183         /* we will also update inode accounting ZAPs */
1184         dmu_tx_hold_bonus(oh->ot_tx, osd->od_iusr_oid);
1185         dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, TRUE, buf);
1186         dmu_tx_hold_bonus(oh->ot_tx, osd->od_igrp_oid);
1187         dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, TRUE, buf);
1188
1189         dmu_tx_hold_sa_create(oh->ot_tx, ZFS_SA_BASE_ATTR_SIZE);
1190
1191         __osd_xattr_declare_set(env, obj, sizeof(struct lustre_mdt_attrs),
1192                                 XATTR_NAME_LMA, oh);
1193
1194         rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid, 1, oh,
1195                                false, NULL, false);
1196         RETURN(rc);
1197 }
1198
1199 int __osd_attr_init(const struct lu_env *env, struct osd_device *osd,
1200                     uint64_t oid, dmu_tx_t *tx, struct lu_attr *la,
1201                     uint64_t parent)
1202 {
1203         sa_bulk_attr_t  *bulk;
1204         sa_handle_t     *sa_hdl;
1205         struct osa_attr *osa = &osd_oti_get(env)->oti_osa;
1206         uint64_t         gen;
1207         uint64_t         crtime[2];
1208         timestruc_t      now;
1209         int              cnt;
1210         int              rc;
1211
1212         gethrestime(&now);
1213         gen = dmu_tx_get_txg(tx);
1214
1215         ZFS_TIME_ENCODE(&now, crtime);
1216
1217         osa->atime[0] = la->la_atime;
1218         osa->ctime[0] = la->la_ctime;
1219         osa->mtime[0] = la->la_mtime;
1220         osa->mode = la->la_mode;
1221         osa->uid = la->la_uid;
1222         osa->gid = la->la_gid;
1223         osa->rdev = la->la_rdev;
1224         osa->nlink = la->la_nlink;
1225         osa->flags = attrs_fs2zfs(la->la_flags);
1226         osa->size  = la->la_size;
1227
1228         /* Now add in all of the "SA" attributes */
1229         rc = -sa_handle_get(osd->od_os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl);
1230         if (rc)
1231                 return rc;
1232
1233         OBD_ALLOC(bulk, sizeof(sa_bulk_attr_t) * 13);
1234         if (bulk == NULL) {
1235                 rc = -ENOMEM;
1236                 goto out;
1237         }
1238         /*
1239          * we need to create all SA below upon object create.
1240          *
1241          * XXX The attribute order matters since the accounting callback relies
1242          * on static offsets (i.e. SA_*_OFFSET, see zfs_space_delta_cb()) to
1243          * look up the UID/GID attributes. Moreover, the callback does not seem
1244          * to support the spill block.
1245          * We define attributes in the same order as SA_*_OFFSET in order to
1246          * work around the problem. See ORI-610.
1247          */
1248         cnt = 0;
1249         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(osd), NULL, &osa->mode, 8);
1250         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_SIZE(osd), NULL, &osa->size, 8);
1251         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GEN(osd), NULL, &gen, 8);
1252         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_UID(osd), NULL, &osa->uid, 8);
1253         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_GID(osd), NULL, &osa->gid, 8);
1254         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_PARENT(osd), NULL, &parent, 8);
1255         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(osd), NULL, &osa->flags, 8);
1256         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(osd), NULL, osa->atime, 16);
1257         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(osd), NULL, osa->mtime, 16);
1258         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(osd), NULL, osa->ctime, 16);
1259         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CRTIME(osd), NULL, crtime, 16);
1260         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_LINKS(osd), NULL, &osa->nlink, 8);
1261         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_RDEV(osd), NULL, &osa->rdev, 8);
1262
1263         rc = -sa_replace_all_by_template(sa_hdl, bulk, cnt, tx);
1264
1265         OBD_FREE(bulk, sizeof(sa_bulk_attr_t) * 13);
1266 out:
1267         sa_handle_destroy(sa_hdl);
1268         return rc;
1269 }
1270
1271 /*
1272  * The transaction passed to this routine must have
1273  * dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT) called and then assigned
1274  * to a transaction group.
1275  */
1276 int __osd_object_create(const struct lu_env *env, struct osd_object *obj,
1277                         dmu_buf_t **dbp, dmu_tx_t *tx, struct lu_attr *la,
1278                         uint64_t parent)
1279 {
1280         uint64_t             oid;
1281         int                  rc;
1282         struct osd_device   *osd = osd_obj2dev(obj);
1283         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
1284         dmu_object_type_t    type = DMU_OT_PLAIN_FILE_CONTENTS;
1285
1286         /* Assert that the transaction has been assigned to a
1287            transaction group. */
1288         LASSERT(tx->tx_txg != 0);
1289
1290         /* Use DMU_OTN_UINT8_METADATA for local objects so their data blocks
1291          * would get an additional ditto copy */
1292         if (unlikely(S_ISREG(la->la_mode) &&
1293                      fid_seq_is_local_file(fid_seq(fid))))
1294                 type = DMU_OTN_UINT8_METADATA;
1295
1296         /* Create a new DMU object using the default dnode size. */
1297         oid = osd_dmu_object_alloc(osd->od_os, type, 0, 0, tx);
1298         rc = -sa_buf_hold(osd->od_os, oid, osd_obj_tag, dbp);
1299         LASSERTF(rc == 0, "sa_buf_hold "LPU64" failed: %d\n", oid, rc);
1300
1301         LASSERT(la->la_valid & LA_MODE);
1302         la->la_size = 0;
1303         la->la_nlink = 1;
1304
1305         rc = __osd_attr_init(env, osd, oid, tx, la, parent);
1306         if (rc != 0) {
1307                 sa_buf_rele(*dbp, osd_obj_tag);
1308                 *dbp = NULL;
1309                 dmu_object_free(osd->od_os, oid, tx);
1310                 return rc;
1311         }
1312
1313         return 0;
1314 }
1315
1316 /*
1317  * The transaction passed to this routine must have
1318  * dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, ...) called and then assigned
1319  * to a transaction group.
1320  *
1321  * Using ZAP_FLAG_HASH64 will force the ZAP to always be a FAT ZAP.
1322  * This is fine for directories today, because storing the FID in the dirent
1323  * will also require a FAT ZAP.  If there is a new type of micro ZAP created
1324  * then we might need to re-evaluate the use of this flag and instead do
1325  * a conversion from the different internal ZAP hash formats being used. */
1326 int __osd_zap_create(const struct lu_env *env, struct osd_device *osd,
1327                      dmu_buf_t **zap_dbp, dmu_tx_t *tx,
1328                      struct lu_attr *la, uint64_t parent, zap_flags_t flags)
1329 {
1330         uint64_t oid;
1331         int      rc;
1332
1333         /* Assert that the transaction has been assigned to a
1334            transaction group. */
1335         LASSERT(tx->tx_txg != 0);
1336
1337         oid = osd_zap_create_flags(osd->od_os, 0, flags | ZAP_FLAG_HASH64,
1338                                    DMU_OT_DIRECTORY_CONTENTS,
1339                                    14, /* == ZFS fzap_default_blockshift */
1340                                    DN_MAX_INDBLKSHIFT, /* indirect blockshift */
1341                                    0, tx);
1342
1343         rc = -sa_buf_hold(osd->od_os, oid, osd_obj_tag, zap_dbp);
1344         if (rc)
1345                 return rc;
1346
1347         LASSERT(la->la_valid & LA_MODE);
1348         la->la_size = 2;
1349         la->la_nlink = 1;
1350
1351         return __osd_attr_init(env, osd, oid, tx, la, parent);
1352 }
1353
1354 static dmu_buf_t *osd_mkidx(const struct lu_env *env, struct osd_object *obj,
1355                             struct lu_attr *la, uint64_t parent,
1356                             struct osd_thandle *oh)
1357 {
1358         dmu_buf_t *db;
1359         int        rc;
1360
1361         /* Index file should be created as regular file in order not to confuse
1362          * ZPL which could interpret them as directory.
1363          * We set ZAP_FLAG_UINT64_KEY to let ZFS know than we are going to use
1364          * binary keys */
1365         LASSERT(S_ISREG(la->la_mode));
1366         rc = __osd_zap_create(env, osd_obj2dev(obj), &db, oh->ot_tx, la, parent,
1367                               ZAP_FLAG_UINT64_KEY);
1368         if (rc)
1369                 return ERR_PTR(rc);
1370         return db;
1371 }
1372
1373 static dmu_buf_t *osd_mkdir(const struct lu_env *env, struct osd_object *obj,
1374                             struct lu_attr *la, uint64_t parent,
1375                             struct osd_thandle *oh)
1376 {
1377         dmu_buf_t *db;
1378         int        rc;
1379
1380         LASSERT(S_ISDIR(la->la_mode));
1381         rc = __osd_zap_create(env, osd_obj2dev(obj), &db,
1382                               oh->ot_tx, la, parent, 0);
1383         if (rc)
1384                 return ERR_PTR(rc);
1385         return db;
1386 }
1387
1388 static dmu_buf_t *osd_mkreg(const struct lu_env *env, struct osd_object *obj,
1389                             struct lu_attr *la, uint64_t parent,
1390                             struct osd_thandle *oh)
1391 {
1392         const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu);
1393         dmu_buf_t           *db;
1394         int                  rc;
1395         struct osd_device *osd = osd_obj2dev(obj);
1396
1397         LASSERT(S_ISREG(la->la_mode));
1398         rc = __osd_object_create(env, obj, &db, oh->ot_tx, la, parent);
1399         if (rc)
1400                 return ERR_PTR(rc);
1401
1402         /*
1403          * XXX: This heuristic is non-optimal.  It would be better to
1404          * increase the blocksize up to osd->od_max_blksz during the write.
1405          * This is exactly how the ZPL behaves and it ensures that the right
1406          * blocksize is selected based on the file size rather than the
1407          * making broad assumptions based on the osd type.
1408          */
1409         if ((fid_is_idif(fid) || fid_is_norm(fid)) && osd->od_is_ost) {
1410                 rc = -dmu_object_set_blocksize(osd->od_os, db->db_object,
1411                                                osd->od_max_blksz, 0, oh->ot_tx);
1412                 if (unlikely(rc)) {
1413                         CERROR("%s: can't change blocksize: %d\n",
1414                                osd->od_svname, rc);
1415                         return ERR_PTR(rc);
1416                 }
1417         }
1418
1419         return db;
1420 }
1421
1422 static dmu_buf_t *osd_mksym(const struct lu_env *env, struct osd_object *obj,
1423                             struct lu_attr *la, uint64_t parent,
1424                             struct osd_thandle *oh)
1425 {
1426         dmu_buf_t *db;
1427         int        rc;
1428
1429         LASSERT(S_ISLNK(la->la_mode));
1430         rc = __osd_object_create(env, obj, &db, oh->ot_tx, la, parent);
1431         if (rc)
1432                 return ERR_PTR(rc);
1433         return db;
1434 }
1435
1436 static dmu_buf_t *osd_mknod(const struct lu_env *env, struct osd_object *obj,
1437                             struct lu_attr *la, uint64_t parent,
1438                             struct osd_thandle *oh)
1439 {
1440         dmu_buf_t *db;
1441         int        rc;
1442
1443         la->la_valid = LA_MODE;
1444         if (S_ISCHR(la->la_mode) || S_ISBLK(la->la_mode))
1445                 la->la_valid |= LA_RDEV;
1446
1447         rc = __osd_object_create(env, obj, &db, oh->ot_tx, la, parent);
1448         if (rc)
1449                 return ERR_PTR(rc);
1450         return db;
1451 }
1452
1453 typedef dmu_buf_t *(*osd_obj_type_f)(const struct lu_env *env,
1454                                      struct osd_object *obj,
1455                                      struct lu_attr *la,
1456                                      uint64_t parent,
1457                                      struct osd_thandle *oh);
1458
1459 static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
1460 {
1461         osd_obj_type_f result;
1462
1463         switch (type) {
1464         case DFT_DIR:
1465                 result = osd_mkdir;
1466                 break;
1467         case DFT_INDEX:
1468                 result = osd_mkidx;
1469                 break;
1470         case DFT_REGULAR:
1471                 result = osd_mkreg;
1472                 break;
1473         case DFT_SYM:
1474                 result = osd_mksym;
1475                 break;
1476         case DFT_NODE:
1477                 result = osd_mknod;
1478                 break;
1479         default:
1480                 LBUG();
1481                 break;
1482         }
1483         return result;
1484 }
1485
1486 /*
1487  * Primitives for directory (i.e. ZAP) handling
1488  */
1489 static inline int osd_init_lma(const struct lu_env *env, struct osd_object *obj,
1490                                const struct lu_fid *fid, struct osd_thandle *oh)
1491 {
1492         struct osd_thread_info  *info = osd_oti_get(env);
1493         struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
1494         struct lu_buf            buf;
1495         int rc;
1496
1497         lustre_lma_init(lma, fid, 0, 0);
1498         lustre_lma_swab(lma);
1499         buf.lb_buf = lma;
1500         buf.lb_len = sizeof(*lma);
1501
1502         rc = osd_xattr_set_internal(env, obj, &buf, XATTR_NAME_LMA,
1503                                     LU_XATTR_CREATE, oh);
1504
1505         return rc;
1506 }
1507
1508 /*
1509  * Concurrency: @dt is write locked.
1510  */
1511 static int osd_object_create(const struct lu_env *env, struct dt_object *dt,
1512                              struct lu_attr *attr,
1513                              struct dt_allocation_hint *hint,
1514                              struct dt_object_format *dof,
1515                              struct thandle *th)
1516 {
1517         struct zpl_direntry     *zde = &osd_oti_get(env)->oti_zde.lzd_reg;
1518         const struct lu_fid     *fid = lu_object_fid(&dt->do_lu);
1519         struct osd_object       *obj = osd_dt_obj(dt);
1520         struct osd_device       *osd = osd_obj2dev(obj);
1521         char                    *buf = osd_oti_get(env)->oti_str;
1522         struct osd_thandle      *oh;
1523         dmu_buf_t               *db;
1524         uint64_t                 zapid;
1525         int                      rc;
1526
1527         ENTRY;
1528
1529         /* concurrent create declarations should not see
1530          * the object inconsistent (db, attr, etc).
1531          * in regular cases acquisition should be cheap */
1532         down_write(&obj->oo_guard);
1533
1534         if (unlikely(dt_object_exists(dt)))
1535                 GOTO(out, rc = -EEXIST);
1536
1537         LASSERT(osd_invariant(obj));
1538         LASSERT(dof != NULL);
1539
1540         LASSERT(th != NULL);
1541         oh = container_of0(th, struct osd_thandle, ot_super);
1542
1543         /*
1544          * XXX missing: Quote handling.
1545          */
1546
1547         LASSERT(obj->oo_db == NULL);
1548
1549         /* to follow ZFS on-disk format we need
1550          * to initialize parent dnode properly */
1551         zapid = 0;
1552         if (hint != NULL && hint->dah_parent != NULL &&
1553             !dt_object_remote(hint->dah_parent))
1554                 zapid = osd_dt_obj(hint->dah_parent)->oo_db->db_object;
1555
1556         db = osd_create_type_f(dof->dof_type)(env, obj, attr, zapid, oh);
1557         if (IS_ERR(db))
1558                 GOTO(out, rc = PTR_ERR(db));
1559
1560         zde->zde_pad = 0;
1561         zde->zde_dnode = db->db_object;
1562         zde->zde_type = IFTODT(attr->la_mode & S_IFMT);
1563
1564         zapid = osd_get_name_n_idx(env, osd, fid, buf);
1565
1566         rc = -zap_add(osd->od_os, zapid, buf, 8, 1, zde, oh->ot_tx);
1567         if (rc)
1568                 GOTO(out, rc);
1569
1570         /* Add new object to inode accounting.
1571          * Errors are not considered as fatal */
1572         rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid,
1573                                 (attr->la_valid & LA_UID) ? attr->la_uid : 0, 1,
1574                                 oh->ot_tx);
1575         if (rc)
1576                 CERROR("%s: failed to add "DFID" to accounting ZAP for usr %d "
1577                         "(%d)\n", osd->od_svname, PFID(fid), attr->la_uid, rc);
1578         rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid,
1579                                 (attr->la_valid & LA_GID) ? attr->la_gid : 0, 1,
1580                                 oh->ot_tx);
1581         if (rc)
1582                 CERROR("%s: failed to add "DFID" to accounting ZAP for grp %d "
1583                         "(%d)\n", osd->od_svname, PFID(fid), attr->la_gid, rc);
1584
1585         /* configure new osd object */
1586         obj->oo_db = db;
1587         rc = osd_object_init0(env, obj);
1588         LASSERT(ergo(rc == 0, dt_object_exists(dt)));
1589         LASSERT(osd_invariant(obj));
1590
1591         rc = osd_init_lma(env, obj, fid, oh);
1592         if (rc != 0)
1593                 CERROR("%s: can not set LMA on "DFID": rc = %d\n",
1594                        osd->od_svname, PFID(fid), rc);
1595
1596 out:
1597         up_write(&obj->oo_guard);
1598         RETURN(rc);
1599 }
1600
1601 static int osd_declare_object_ref_add(const struct lu_env *env,
1602                                       struct dt_object *dt,
1603                                       struct thandle *th)
1604 {
1605         return osd_declare_attr_set(env, dt, NULL, th);
1606 }
1607
1608 /*
1609  * Concurrency: @dt is write locked.
1610  */
1611 static int osd_object_ref_add(const struct lu_env *env,
1612                               struct dt_object *dt,
1613                               struct thandle *handle)
1614 {
1615         struct osd_object       *obj = osd_dt_obj(dt);
1616         struct osd_thandle      *oh;
1617         struct osd_device       *osd = osd_obj2dev(obj);
1618         uint64_t                 nlink;
1619         int rc;
1620
1621         ENTRY;
1622
1623         down_read(&obj->oo_guard);
1624         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1625                 GOTO(out, rc = -ENOENT);
1626
1627         LASSERT(osd_invariant(obj));
1628         LASSERT(obj->oo_sa_hdl != NULL);
1629
1630         oh = container_of0(handle, struct osd_thandle, ot_super);
1631
1632         write_lock(&obj->oo_attr_lock);
1633         nlink = ++obj->oo_attr.la_nlink;
1634         write_unlock(&obj->oo_attr_lock);
1635
1636         rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh);
1637
1638 out:
1639         up_read(&obj->oo_guard);
1640         RETURN(rc);
1641 }
1642
1643 static int osd_declare_object_ref_del(const struct lu_env *env,
1644                                       struct dt_object *dt,
1645                                       struct thandle *handle)
1646 {
1647         return osd_declare_attr_set(env, dt, NULL, handle);
1648 }
1649
1650 /*
1651  * Concurrency: @dt is write locked.
1652  */
1653 static int osd_object_ref_del(const struct lu_env *env,
1654                               struct dt_object *dt,
1655                               struct thandle *handle)
1656 {
1657         struct osd_object       *obj = osd_dt_obj(dt);
1658         struct osd_thandle      *oh;
1659         struct osd_device       *osd = osd_obj2dev(obj);
1660         uint64_t                 nlink;
1661         int                      rc;
1662
1663         ENTRY;
1664
1665         down_read(&obj->oo_guard);
1666
1667         if (unlikely(!dt_object_exists(dt) || obj->oo_destroyed))
1668                 GOTO(out, rc = -ENOENT);
1669
1670         LASSERT(osd_invariant(obj));
1671         LASSERT(obj->oo_sa_hdl != NULL);
1672
1673         oh = container_of0(handle, struct osd_thandle, ot_super);
1674         LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
1675
1676         write_lock(&obj->oo_attr_lock);
1677         nlink = --obj->oo_attr.la_nlink;
1678         write_unlock(&obj->oo_attr_lock);
1679
1680         rc = osd_object_sa_update(obj, SA_ZPL_LINKS(osd), &nlink, 8, oh);
1681
1682 out:
1683         up_read(&obj->oo_guard);
1684         RETURN(rc);
1685 }
1686
1687 static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
1688                            __u64 start, __u64 end)
1689 {
1690         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
1691         ENTRY;
1692
1693         /* XXX: no other option than syncing the whole filesystem until we
1694          * support ZIL.  If the object tracked the txg that it was last
1695          * modified in, it could pass that txg here instead of "0".  Maybe
1696          * the changes are already committed, so no wait is needed at all? */
1697         txg_wait_synced(dmu_objset_pool(osd->od_os), 0ULL);
1698
1699         RETURN(0);
1700 }
1701
1702 static struct dt_object_operations osd_obj_ops = {
1703         .do_read_lock           = osd_object_read_lock,
1704         .do_write_lock          = osd_object_write_lock,
1705         .do_read_unlock         = osd_object_read_unlock,
1706         .do_write_unlock        = osd_object_write_unlock,
1707         .do_write_locked        = osd_object_write_locked,
1708         .do_attr_get            = osd_attr_get,
1709         .do_declare_attr_set    = osd_declare_attr_set,
1710         .do_attr_set            = osd_attr_set,
1711         .do_ah_init             = osd_ah_init,
1712         .do_declare_create      = osd_declare_object_create,
1713         .do_create              = osd_object_create,
1714         .do_declare_destroy     = osd_declare_object_destroy,
1715         .do_destroy             = osd_object_destroy,
1716         .do_index_try           = osd_index_try,
1717         .do_declare_ref_add     = osd_declare_object_ref_add,
1718         .do_ref_add             = osd_object_ref_add,
1719         .do_declare_ref_del     = osd_declare_object_ref_del,
1720         .do_ref_del             = osd_object_ref_del,
1721         .do_xattr_get           = osd_xattr_get,
1722         .do_declare_xattr_set   = osd_declare_xattr_set,
1723         .do_xattr_set           = osd_xattr_set,
1724         .do_declare_xattr_del   = osd_declare_xattr_del,
1725         .do_xattr_del           = osd_xattr_del,
1726         .do_xattr_list          = osd_xattr_list,
1727         .do_object_sync         = osd_object_sync,
1728 };
1729
1730 static struct lu_object_operations osd_lu_obj_ops = {
1731         .loo_object_init        = osd_object_init,
1732         .loo_object_delete      = osd_object_delete,
1733         .loo_object_release     = osd_object_release,
1734         .loo_object_free        = osd_object_free,
1735         .loo_object_print       = osd_object_print,
1736         .loo_object_invariant   = osd_object_invariant,
1737 };
1738
1739 static int osd_otable_it_attr_get(const struct lu_env *env,
1740                                 struct dt_object *dt,
1741                                 struct lu_attr *attr)
1742 {
1743         attr->la_valid = 0;
1744         return 0;
1745 }
1746
1747 static struct dt_object_operations osd_obj_otable_it_ops = {
1748         .do_attr_get    = osd_otable_it_attr_get,
1749         .do_index_try   = osd_index_try,
1750 };