4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2014, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
32 * lustre/ofd/ofd_objects.c
34 * This file contains OSD API methods related to OBD Filter Device (OFD)
37 * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
38 * Author: Mikhail Pershin <mike.pershin@intel.com>
41 #define DEBUG_SUBSYSTEM S_FILTER
43 #include <dt_object.h>
44 #include <lustre_lfsck.h>
46 #include "ofd_internal.h"
49 * Get object version from disk and check it.
51 * This function compares the object version on disk against
52 * ofd_thread_info::fti_pre_version filled from the incoming RPC. This is part of
53 * VBR (Version-Based Recovery) and ensures that the object has the same version
54 * upon replay as it had during the original modification.
56 * \param[in] info execution thread OFD private data
57 * \param[in] fo OFD object
59 * \retval 0 if version matches
60 * \retval -EOVERFLOW on version mismatch
/* NOTE(review): source lines are missing from this listing (see the gaps in
 * the embedded line numbers), so braces and return statements are not
 * visible; the comments below describe only the statements that are shown. */
62 static int ofd_version_get_check(struct ofd_thread_info *info,
63 struct ofd_object *fo)
65 dt_obj_version_t curr_version;
/* The object must already exist; this is asserted rather than handled. */
67 LASSERT(ofd_object_exists(fo));
/* Special case for a thread info without an export; the action taken is on
 * a line not shown here. */
69 if (info->fti_exp == NULL)
/* Read the current version through the child dt object. */
72 curr_version = dt_version_get(info->fti_env, ofd_object_child(fo));
/* The cast to __s64 lets the returned value be compared with a negative
 * errno; -EOPNOTSUPP is tested for explicitly (the action is not shown). */
73 if ((__s64)curr_version == -EOPNOTSUPP)
75 /* VBR: the version is always checked because the check costs nothing */
/* A pre-version of 0 disables the comparison. */
76 if (info->fti_pre_version != 0 &&
77 info->fti_pre_version != curr_version) {
78 CDEBUG(D_INODE, "Version mismatch %#llx != %#llx\n",
79 info->fti_pre_version, curr_version);
/* Flag the VBR failure on the export while holding its exp_lock. */
80 spin_lock(&info->fti_exp->exp_lock);
81 info->fti_exp->exp_vbr_failed = 1;
82 spin_unlock(&info->fti_exp->exp_lock);
/* Store the version just read back into the thread info. */
85 info->fti_pre_version = curr_version;
90 * Get OFD object by FID.
92 * This function finds OFD slice of compound object with the given FID.
94 * \param[in] env execution environment
95 * \param[in] ofd OFD device
96 * \param[in] fid FID of the object
98 * \retval pointer to the found ofd_object
99 * \retval ERR_PTR(errno) in case of error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering); the declaration of 'o', the success-path statement
 * and the return are not visible. */
101 struct ofd_object *ofd_object_find(const struct lu_env *env,
102 struct ofd_device *ofd,
103 const struct lu_fid *fid)
105 struct ofd_object *fo;
/* Look the object up by FID on the lu_device embedded in the OFD device. */
110 o = lu_object_find(env, &ofd->ofd_dt_dev.dd_lu_dev, fid, NULL);
/* Expected case: the lookup did not return an error pointer (the statement
 * executed in this case is not shown). */
111 if (likely(!IS_ERR(o)))
114 fo = ERR_CAST(o); /* carry the lookup error over as an ofd_object pointer */
120 * Get FID of parent MDT object.
122 * This function reads extended attribute XATTR_NAME_FID of OFD object which
123 * contains the MDT parent object FID and saves it in ofd_object::ofo_ff.
125 * The filter_fid::ff_parent::f_ver field currently holds
126 * the OST-object index in the parent MDT-object's layout EA,
127 * not the actual FID::f_ver of the parent. We therefore access
128 * it via the macro f_stripe_idx.
130 * \param[in] env execution environment
131 * \param[in] fo OFD object
133 * \retval 0 if successful
134 * \retval -ENODATA if there is no such xattr
135 * \retval negative value on error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering); the declaration of rc, the setup of buf->lb_buf, the
 * error checks and the returns are not visible. */
137 int ofd_object_ff_load(const struct lu_env *env, struct ofd_object *fo)
139 struct ofd_thread_info *info = ofd_info(env);
/* ff aliases the filter_fid stored inside the OFD object itself. */
140 struct filter_fid *ff = &fo->ofo_ff;
141 struct lu_buf *buf = &info->fti_buf;
/* The parent FID already held in the object is tested first; presumably a
 * sane FID means the xattr was loaded earlier and is reused - TODO confirm,
 * the branch body is not shown. */
144 if (fid_is_sane(&ff->ff_parent))
/* Ask for at most one full filter_fid from the XATTR_NAME_FID xattr. */
148 buf->lb_len = sizeof(*ff);
149 rc = dt_xattr_get(env, ofd_object_child(fo), buf, XATTR_NAME_FID);
/* NOTE(review): if rc is a signed int, this comparison against an unsigned
 * sizeof only behaves as intended when negative rc values were handled
 * before this point - confirm against the lines missing here. */
/* A result shorter than a lu_fid leads to the parent FID being zeroed. */
153 if (unlikely(rc < sizeof(struct lu_fid))) {
154 fid_zero(&ff->ff_parent);
/* Convert in place (source and destination are both ff) from little-endian
 * to CPU byte order; rc is passed as the size. */
159 filter_fid_le_to_cpu(ff, ff, rc);
165 * Precreate the given number \a nr of objects in the given sequence \a oseq.
167 * This function precreates new OST objects in the given sequence.
168 * The precreation starts from \a id and creates \a nr objects sequentially.
171 * This function may create fewer objects than requested.
173 * We mark the object SUID+SGID+SVTX to flag it for accepting UID+GID+PROJID from
174 * the client on first write. Currently the permission bits on the OST are never used.
177 * Initialize a/c/m time so any client timestamp will always be newer and
178 * update the inode. The ctime = 0 case is also handled specially in
179 * osd_inode_setattr(). See LU-221, LU-1042 for details.
181 * \param[in] env execution environment
182 * \param[in] ofd OFD device
183 * \param[in] id object ID to start precreation from
184 * \param[in] oseq object sequence
185 * \param[in] nr number of objects to precreate
186 * \param[in] sync synchronous precreation flag
188 * \retval 0 if successful
189 * \retval negative value on error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering): several declarations, most error checks, the labels
 * targeted by GOTO and the loop bodies' bookkeeping are not visible. The
 * comments below describe only the statements that are shown. */
191 int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
192 u64 id, struct ofd_seq *oseq, int nr, int sync)
194 struct ofd_thread_info *info = ofd_info(env);
195 struct ofd_object *fo = NULL;
196 struct dt_object *next;
198 struct ofd_object **batch;
/* fid is scratch space in the per-thread info, reused for every object. */
199 struct lu_fid *fid = &info->fti_fid;
209 /* Don't create objects beyond the valid range for this SEQ */
/* The limit is IDIF_MAX_OID when the sequence satisfies fid_seq_is_mdt0()
 * and OBIF_MAX_OID otherwise; exceeding it fails with -ENOSPC.
 * NOTE(review): the arguments after ofd_name() are passed as (id, seq);
 * DOSTID is defined elsewhere - confirm this order matches its fields. */
210 if (unlikely(fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
211 (id + nr) >= IDIF_MAX_OID)) {
212 CERROR("%s:"DOSTID" hit the IDIF_MAX_OID (1<<48)!\n",
213 ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
214 RETURN(rc = -ENOSPC);
215 } else if (unlikely(!fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
216 (id + nr) >= OBIF_MAX_OID)) {
217 CERROR("%s:"DOSTID" hit the OBIF_MAX_OID (1<<32)!\n",
218 ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
219 RETURN(rc = -ENOSPC);
/* Array of nr_saved object pointers; nr_saved is assigned on a line not
 * shown here (presumably the originally requested nr - TODO confirm). */
222 OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
/* New objects are regular files with the SUID, SGID and sticky bits set on
 * top of mode 0666. */
226 info->fti_attr.la_valid = LA_TYPE | LA_MODE;
227 info->fti_attr.la_mode = S_IFREG | S_ISUID | S_ISGID | S_ISVTX | 0666;
228 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
/* All three timestamps are explicitly initialized to zero. */
230 info->fti_attr.la_valid |= LA_ATIME | LA_MTIME | LA_CTIME;
231 info->fti_attr.la_atime = 0;
232 info->fti_attr.la_mtime = 0;
233 info->fti_attr.la_ctime = 0;
237 /* prepare objects */
/* Start from the FID of the LAST_ID object and substitute each object ID
 * (id + i) in turn. */
238 *fid = *lu_object_fid(&oseq->os_lastid_obj->do_lu);
239 for (i = 0; i < nr; i++) {
240 rc = fid_set_id(fid, id + i);
/* Look up the OFD object for this FID; a lookup error jumps to 'out'. */
249 fo = ofd_object_find(env, ofd, fid);
252 GOTO(out, rc = PTR_ERR(fo));
/* The object's write lock is taken here and released in the loop over
 * nr_saved near the end of the function. */
258 ofd_write_lock(env, fo);
/* The per-thread buffer is pointed at the local variable tmp, which carries
 * the value written to the LAST_ID object below. */
261 info->fti_buf.lb_buf = &tmp;
262 info->fti_buf.lb_len = sizeof(tmp);
265 th = ofd_trans_create(env, ofd);
267 GOTO(out, rc = PTR_ERR(th));
/* Declare the write to the LAST_ID object. */
271 rc = dt_declare_record_write(env, oseq->os_lastid_obj, &info->fti_buf,
274 GOTO(trans_stop, rc);
/* Declaration pass over the objects. */
276 for (i = 0; i < nr; i++) {
280 if (unlikely(ofd_object_exists(fo))) {
281 /* the object may already exist, having been re-created by write replay */
282 CDEBUG(D_INODE, "object %#llx/%#llx exists: "
283 DFID"\n", ostid_seq(&oseq->os_oi), id,
284 PFID(lu_object_fid(&fo->ofo_obj.do_lu)));
288 next = ofd_object_child(fo);
289 LASSERT(next != NULL);
291 rc = dt_declare_create(env, next, &info->fti_attr, NULL,
295 GOTO(trans_stop, rc);
/* The transaction is started with dt_trans_start_local(). */
302 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
304 GOTO(trans_stop, rc);
306 CDEBUG(D_OTHER, "%s: create new object "DFID" nr %d\n",
307 ofd_name(ofd), PFID(fid), nr);
309 /* While LFSCK scans the whole device to verify the consistency of the
310 * LAST_ID file, it first loads the last_id into RAM and compares it with
311 * each OST-object's ID. If the latter is larger, LFSCK regards the
312 * LAST_ID file as crashed. But during the LFSCK scan the OFD may keep
313 * creating new OST-objects, and those newly created OST-objects will have
314 * larger IDs than the ones LFSCK knows about. So LFSCK needs to re-load
315 * the last_id from the disk file, and only if the latest last_id is
316 * still smaller than the object's ID
317 * is the LAST_ID file really crashed.
319 * To make the above mechanism work, the OFD needs to update the LAST_ID
320 * file before it pre-creates OST-objects; otherwise LFSCK may not be
321 * able to get the latest last_id even though new OST-objects exist. */
/* Write id + nr - 1 (as little-endian 64-bit) to LAST_ID under the LAST_ID
 * object's write lock, unless the OBD_FAIL_LFSCK_SKIP_LASTID check fires. */
322 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_SKIP_LASTID)) {
323 tmp = cpu_to_le64(id + nr - 1);
324 dt_write_lock(env, oseq->os_lastid_obj, 0);
325 rc = dt_record_write(env, oseq->os_lastid_obj,
326 &info->fti_buf, &info->fti_off, th);
327 dt_write_unlock(env, oseq->os_lastid_obj);
329 GOTO(trans_stop, rc);
/* Creation pass over the objects. */
332 for (i = 0; i < nr; i++) {
336 /* Only the newly created objects need to be recorded. */
/* When the OSD asks for accessed FIDs to be recorded, notify LFSCK
 * (layout type) about this object's FID. */
337 if (ofd->ofd_osd->dd_record_fid_accessed) {
338 struct lfsck_req_local *lrl = &ofd_info(env)->fti_lrl;
340 lfsck_pack_rfa(lrl, lu_object_fid(&fo->ofo_obj.do_lu),
341 LEL_FID_ACCESSED, LFSCK_TYPE_LAYOUT);
342 lfsck_in_notify_local(env, ofd->ofd_osd, lrl, NULL);
/* Create the object only if it does not exist yet and the
 * OBD_FAIL_LFSCK_DANGLING check does not fire. */
345 if (likely(!ofd_object_exists(fo) &&
346 !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DANGLING))) {
347 next = ofd_object_child(fo);
348 LASSERT(next != NULL);
350 rc = dt_create(env, next, &info->fti_attr, NULL,
354 GOTO(trans_stop, rc);
359 LASSERT(ofd_object_exists(fo));
/* Record id + i as the last object ID of the sequence. */
361 ofd_seq_last_oid_set(oseq, id + i);
365 /* Not all of the wanted objects have been created:
366 * set the LAST_ID to the last object really created. */
/* 'objects' is the created-object count reported in the final log message;
 * its updates are on lines not shown here. */
367 if (unlikely(objects < nr)) {
371 tmp = cpu_to_le64(ofd_seq_last_oid(oseq));
372 dt_write_lock(env, oseq->os_lastid_obj, 0);
373 rc1 = dt_record_write(env, oseq->os_lastid_obj,
374 &info->fti_buf, &info->fti_off, th);
375 dt_write_unlock(env, oseq->os_lastid_obj);
377 CERROR("%s: fail to reset the LAST_ID for seq (%#llx"
378 ") from %llu to %llu\n", ofd_name(ofd),
379 ostid_seq(&oseq->os_oi), id + nr - 1,
380 ofd_seq_last_oid(oseq));
/* The transaction is stopped with the current rc; its own result is kept
 * separately in rc2. */
384 rc2 = ofd_trans_stop(env, ofd, th, rc);
386 CERROR("%s: failed to stop transaction: rc = %d\n",
/* Release phase: unlock and drop the reference on objects, iterating up to
 * nr_saved (how fo is selected per iteration is not shown). */
391 for (i = 0; i < nr_saved; i++) {
394 ofd_write_unlock(env, fo);
395 ofd_object_put(env, fo);
398 OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));
/* Log at D_ERROR level when nothing was created although rc is 0. */
400 CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
401 "created %d/%d objects: %d\n", objects, nr_saved, rc);
/* Creating nothing must come with a negative rc; the return value is the
 * number of created objects when positive, otherwise rc. */
403 LASSERT(ergo(objects == 0, rc < 0));
404 RETURN(objects > 0 ? objects : rc);
408 * Fix the OFD object ownership.
410 * If the object still has SUID+SGID bits set, meaning that it was precreated
411 * by the MDT before it was assigned to any file, (see ofd_precreate_objects())
412 * then we will accept the UID/GID/PROJID if sent by the client for initializing
413 * the ownership of this object. We only allow this to happen once (so clear
414 * these bits) and later only allow setattr.
416 * \param[in] env execution environment
417 * \param[in] fo OFD object
418 * \param[in] la object attributes
419 * \param[in] is_setattr was this function called from setattr or not
421 * \retval 0 if successful
422 * \retval negative value on error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering): the declarations of rc and mask, the early return,
 * the condition guarding the three "drop" checks and the statements that
 * fill mask are not visible. */
424 int ofd_attr_handle_id(const struct lu_env *env, struct ofd_object *fo,
425 struct lu_attr *la, int is_setattr)
427 struct ofd_thread_info *info = ofd_info(env);
/* ln is per-thread scratch space that receives the object's current
 * attributes. */
428 struct lu_attr *ln = &info->fti_attr2;
/* Tests for the case where the caller supplies none of UID, GID and PROJID;
 * the action taken is on a line not shown here. */
434 if (!(la->la_valid & LA_UID) && !(la->la_valid & LA_GID) &&
435 !(la->la_valid & LA_PROJID))
/* Fetch the current attributes; the mode must be among them. */
438 rc = dt_attr_get(env, ofd_object_child(fo), ln);
442 LASSERT(ln->la_valid & LA_MODE);
445 * Only allow setattr to change UID/GID/PROJID, if
446 * SUID+SGID is not set which means this is not
447 * initialization of this objects.
/* Each ID is removed from la_valid when its marker bit is no longer set on
 * the object: S_ISUID for UID, S_ISGID for GID, S_ISVTX for PROJID.
 * NOTE(review): the enclosing condition is missing from this listing;
 * presumably it restricts these checks to the non-setattr case - confirm. */
450 if (!(ln->la_mode & S_ISUID))
451 la->la_valid &= ~LA_UID;
452 if (!(ln->la_mode & S_ISGID))
453 la->la_valid &= ~LA_GID;
454 if (!(ln->la_mode & S_ISVTX))
455 la->la_valid &= ~LA_PROJID;
458 /* Initialize ownership of this object, clear SUID+SGID bits*/
/* For each ID still being set while its marker bit is present, the bit is
 * presumably added to mask (those statements are not shown - TODO confirm). */
459 if ((la->la_valid & LA_UID) && (ln->la_mode & S_ISUID))
461 if ((la->la_valid & LA_GID) && (ln->la_mode & S_ISGID))
463 if ((la->la_valid & LA_PROJID) && (ln->la_mode & S_ISVTX))
/* If the caller did not supply a mode, or this is not a setattr, start from
 * the object's current mode and mark the mode as valid. */
466 if (!(la->la_valid & LA_MODE) || !is_setattr) {
467 la->la_mode = ln->la_mode;
468 la->la_valid |= LA_MODE;
/* Strip the bits collected in mask from the mode that will be written. */
470 la->la_mode &= ~mask;
477 * Set OFD object attributes.
479 * This function sets OFD object attributes taken from incoming request.
480 * It sets not only regular attributes but also XATTR_NAME_FID extended
481 * attribute if needed. The "fid" xattr allows the object's MDT parent inode
482 * to be found and verified by LFSCK and other tools in case of inconsistency.
484 * \param[in] env execution environment
485 * \param[in] fo OFD object
486 * \param[in] la object attributes
487 * \param[in] ff filter_fid structure, contains additional attributes
489 * \retval 0 if successful
490 * \retval negative value on error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering): several declarations, the checks on rc after each
 * call, the labels and the final return are not visible. */
492 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
493 struct lu_attr *la, struct filter_fid *ff)
495 struct ofd_thread_info *info = ofd_info(env);
496 struct ofd_device *ofd = ofd_obj2dev(fo);
498 struct ofd_mod_data *fmd;
/* The object stays write-locked until the unlock at the end. */
504 ofd_write_lock(env, fo);
505 if (!ofd_object_exists(fo))
506 GOTO(unlock, rc = -ENOENT);
/* When any timestamp is being set, raise the fmd's fmd_mactime_xid to this
 * request's xid if it is lower. */
508 if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) {
509 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
510 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
511 fmd->fmd_mactime_xid = info->fti_xid;
512 ofd_fmd_put(info->fti_exp, fmd);
515 /* VBR: version recovery check */
516 rc = ofd_version_get_check(info, fo);
/* Adjust UID/GID/PROJID handling in la, flagged as a setattr call. */
520 rc = ofd_attr_handle_id(env, fo, la, 1 /* is_setattr */);
/* Load the parent FID xattr into the object (how its result is used is on
 * lines not shown here). */
525 rc = ofd_object_ff_load(env, fo);
532 th = ofd_trans_create(env, ofd);
534 GOTO(unlock, rc = PTR_ERR(th));
/* Declaration phase. */
536 rc = dt_declare_attr_set(env, ofd_object_child(fo), la, th);
/* OBD_FAIL_LFSCK_UNMATCHED_PAIR1/2 checks: when one fires, the parent FID's
 * f_oid in ff is overwritten or decremented. ff is treated as little-endian
 * here (cpu_to_le32 / le32_add_cpu). */
541 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
542 ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
543 else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
544 le32_add_cpu(&ff->ff_parent.f_oid, -1);
/* The per-thread buffer describes ff for the XATTR_NAME_FID update. */
546 info->fti_buf.lb_buf = ff;
547 info->fti_buf.lb_len = sizeof(*ff);
548 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
549 &info->fti_buf, XATTR_NAME_FID, 0,
/* The object is passed to ofd_trans_start() only when the size is being
 * changed; otherwise NULL is passed. */
555 rc = ofd_trans_start(env, ofd, la->la_valid & LA_SIZE ? fo : NULL, th);
/* Execution phase. */
559 rc = dt_attr_set(env, ofd_object_child(fo), la, th);
/* OBD_FAIL_LFSCK_NOPFID check; the action taken is not shown here. */
564 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
567 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
568 XATTR_NAME_FID, 0, th);
/* Refresh the copy kept in the object, converted to CPU byte order. */
570 filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
/* The transaction is stopped with the current rc; its own result is kept
 * separately in rc2. */
576 rc2 = ofd_trans_stop(env, ofd, th, rc);
578 CERROR("%s: failed to stop transaction: rc = %d\n",
584 ofd_write_unlock(env, fo);
590 * Truncate/punch OFD object.
592 * This function frees all of the allocated object's space from the \a start
593 * offset to the \a end offset. For truncate() operations the \a end offset
594 * is OBD_OBJECT_EOF. The functionality to punch holes in an object via
595 * fallocate(FALLOC_FL_PUNCH_HOLE) is not yet implemented (see LU-3606).
597 * \param[in] env execution environment
598 * \param[in] fo OFD object
599 * \param[in] start start offset to punch from
600 * \param[in] end end of punch
601 * \param[in] la object attributes
602 * \param[in] ff filter_fid structure
603 * \param[in] oa obdo struct from incoming request
605 * \retval 0 if successful
606 * \retval negative value on error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering): several declarations, the checks on rc after each
 * call, the labels and the final return are not visible. */
608 int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
609 __u64 start, __u64 end, struct lu_attr *la,
610 struct filter_fid *ff, struct obdo *oa)
612 struct ofd_thread_info *info = ofd_info(env);
613 struct ofd_device *ofd = ofd_obj2dev(fo);
614 struct ofd_mod_data *fmd;
/* dob is the child dt object that the punch and attr_set operate on. */
615 struct dt_object *dob = ofd_object_child(fo);
623 /* we support truncate, not punch yet */
624 LASSERT(end == OBD_OBJECT_EOF);
/* The object stays write-locked until the unlock at the end. */
626 ofd_write_lock(env, fo);
/* Raise the fmd's fmd_mactime_xid to this request's xid if it is lower;
 * here this is done before the existence check. */
627 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
628 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
629 fmd->fmd_mactime_xid = info->fti_xid;
630 ofd_fmd_put(info->fti_exp, fmd);
632 if (!ofd_object_exists(fo))
633 GOTO(unlock, rc = -ENOENT);
/* Parent FID verification runs only when enabled on the device and the
 * request carries OBD_MD_FLFID. */
635 if (ofd->ofd_lfsck_verify_pfid && oa->o_valid & OBD_MD_FLFID) {
636 rc = ofd_verify_ff(env, fo, oa);
641 /* VBR: version recovery check */
642 rc = ofd_version_get_check(info, fo);
/* Adjust UID/GID/PROJID handling in la, flagged as not a setattr call. */
646 rc = ofd_attr_handle_id(env, fo, la, 0 /* !is_setattr */);
/* Load the parent FID xattr into the object (how its result is used is on
 * lines not shown here). */
651 rc = ofd_object_ff_load(env, fo);
658 th = ofd_trans_create(env, ofd);
660 GOTO(unlock, rc = PTR_ERR(th));
/* Declaration phase: attribute update, then punch from start to EOF. */
662 rc = dt_declare_attr_set(env, dob, la, th);
666 rc = dt_declare_punch(env, dob, start, OBD_OBJECT_EOF, th);
/* OBD_FAIL_LFSCK_UNMATCHED_PAIR1/2 checks: when one fires, the parent FID's
 * f_oid in ff is overwritten or decremented. ff is treated as little-endian
 * here (cpu_to_le32 / le32_add_cpu). */
671 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
672 ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
673 else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
674 le32_add_cpu(&ff->ff_parent.f_oid, -1);
/* The per-thread buffer describes ff for the XATTR_NAME_FID update. */
676 info->fti_buf.lb_buf = ff;
677 info->fti_buf.lb_len = sizeof(*ff);
678 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
679 &info->fti_buf, XATTR_NAME_FID, 0,
/* Unlike ofd_attr_set(), the object is always passed to ofd_trans_start(). */
685 rc = ofd_trans_start(env, ofd, fo, th);
/* Execution phase: punch first, then set the attributes. */
689 rc = dt_punch(env, dob, start, OBD_OBJECT_EOF, th);
693 rc = dt_attr_set(env, dob, la, th);
/* OBD_FAIL_LFSCK_NOPFID check; the action taken is not shown here. */
698 if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
701 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
702 XATTR_NAME_FID, 0, th);
/* Refresh the copy kept in the object, converted to CPU byte order. */
704 filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
/* The transaction is stopped with the current rc; its own result is kept
 * separately in rc2. */
710 rc2 = ofd_trans_stop(env, ofd, th, rc);
712 CERROR("%s: failed to stop transaction: rc = %d\n",
717 ofd_write_unlock(env, fo);
723 * Destroy OFD object.
725 * This function destroys OFD object. If object wasn't used at all (orphan)
726 * then local transaction is used, which means the transaction data is not
727 * returned back in reply.
729 * \param[in] env execution environment
730 * \param[in] fo OFD object
731 * \param[in] orphan flag to indicate that object is orphaned
733 * \retval 0 if successful
734 * \retval negative value on error
/* NOTE(review): source lines are missing from this listing (gaps in the
 * embedded numbering): the rest of the parameter list, the declarations, the
 * checks on rc, the condition choosing between the two transaction-start
 * calls, the labels and the final return are not visible. */
736 int ofd_destroy(const struct lu_env *env, struct ofd_object *fo,
739 struct ofd_device *ofd = ofd_obj2dev(fo);
/* The object stays write-locked until the unlock at the end. */
746 ofd_write_lock(env, fo);
747 if (!ofd_object_exists(fo))
748 GOTO(unlock, rc = -ENOENT);
750 th = ofd_trans_create(env, ofd);
752 GOTO(unlock, rc = PTR_ERR(th));
/* Declaration phase: reference drop, then destroy. */
754 rc = dt_declare_ref_del(env, ofd_object_child(fo), th);
758 rc = dt_declare_destroy(env, ofd_object_child(fo), th);
/* Two ways of starting the transaction are visible: a local start and
 * ofd_trans_start(); presumably the local one serves the orphan case
 * described in the function's header comment - TODO confirm, the selecting
 * condition is not shown. */
763 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
765 rc = ofd_trans_start(env, ofd, NULL, th);
/* Drop the fmd kept for this object's FID on the export. */
769 ofd_fmd_drop(ofd_info(env)->fti_exp, &fo->ofo_header.loh_fid);
/* NOTE(review): the results of dt_ref_del() and dt_destroy() are not
 * captured in these statements - confirm that ignoring them is intended. */
771 dt_ref_del(env, ofd_object_child(fo), th);
772 dt_destroy(env, ofd_object_child(fo), th);
/* The transaction is stopped with the current rc; its own result is kept
 * separately in rc2. */
774 rc2 = ofd_trans_stop(env, ofd, th, rc);
776 CERROR("%s failed to stop transaction: %d\n",
781 ofd_write_unlock(env, fo);
786 * Get OFD object attributes.
788 * This function gets OFD object regular attributes. It is used to serve
789 * incoming request as well as for local OFD purposes.
791 * \param[in] env execution environment
792 * \param[in] fo OFD object
793 * \param[in] la object attributes
795 * \retval 0 if successful
796 * \retval negative value on error
798 int ofd_attr_get(const struct lu_env *env, struct ofd_object *fo,
805 if (ofd_object_exists(fo)) {
806 rc = dt_attr_get(env, ofd_object_child(fo), la);