4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2014 Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
32 * lustre/ofd/ofd_objects.c
34 * This file contains OSD API methods related to OBD Filter Device (OFD)
37 * Author: Alex Zhuravlev <alexey.zhuravlev@intel.com>
38 * Author: Mikhail Pershin <mike.pershin@intel.com>
41 #define DEBUG_SUBSYSTEM S_FILTER
43 #include <dt_object.h>
44 #include <lustre/lustre_idl.h>
45 #include <lustre_lfsck.h>
47 #include "ofd_internal.h"
50 * Get object version from disk and check it.
52 * This function checks object version from disk with
53 * ofd_thread_info::fti_pre_version filled from incoming RPC. This is part of
54 * VBR (Version-Based Recovery) and ensures that object has the same version
55 * upon replay as it had during the original modification.
57 * \param[in] info execution thread OFD private data
58 * \param[in] fo OFD object
60 * \retval 0 if version matches
61 * \retval -EOVERFLOW on version mismatch
/* Compare the on-disk version of \a fo with info->fti_pre_version and mark
 * the export as VBR-failed when they differ.
 * NOTE(review): this listing omits short lines (braces, early returns), so
 * the comments below describe only the statements that are visible. */
63 int ofd_version_get_check(struct ofd_thread_info *info,
64 struct ofd_object *fo)
66 dt_obj_version_t curr_version;
/* The caller must pass an object that already exists. */
68 LASSERT(ofd_object_exists(fo));
/* No export is attached to this thread info; the statement guarded by this
 * test is not visible here (presumably an early return -- confirm). */
70 if (info->fti_exp == NULL)
/* Read the current version from the underlying dt object. */
73 curr_version = dt_version_get(info->fti_env, ofd_object_child(fo));
/* The cast to __s64 lets the value be compared with the negative errno
 * -EOPNOTSUPP; the guarded statement is not visible here. */
74 if ((__s64)curr_version == -EOPNOTSUPP)
76 /* VBR: the version is always checked because it costs nothing */
/* A pre-version of 0 disables the comparison. */
77 if (info->fti_pre_version != 0 &&
78 info->fti_pre_version != curr_version) {
79 CDEBUG(D_INODE, "Version mismatch "LPX64" != "LPX64"\n",
80 info->fti_pre_version, curr_version);
/* Record the failure on the export while holding exp_lock. */
81 spin_lock(&info->fti_exp->exp_lock);
82 info->fti_exp->exp_vbr_failed = 1;
83 spin_unlock(&info->fti_exp->exp_lock);
/* Store the version read from disk in the thread info. */
86 info->fti_pre_version = curr_version;
91 * Get OFD object by FID.
93 * This function finds OFD slice of compound object with the given FID.
95 * \param[in] env execution environment
96 * \param[in] ofd OFD device
97 * \param[in] fid FID of the object
99 * \retval pointer to the found ofd_object
100 * \retval ERR_PTR(errno) in case of error
/* Look up the object with the given FID on the OFD device and return it as
 * a struct ofd_object pointer, or an error pointer on failure. */
102 struct ofd_object *ofd_object_find(const struct lu_env *env,
103 struct ofd_device *ofd,
104 const struct lu_fid *fid)
106 struct ofd_object *fo;
/* The lookup goes through lu_object_find() on the lu_device embedded in
 * the OFD's dt_device; NULL is passed as the last argument. */
111 o = lu_object_find(env, &ofd->ofd_dt_dev.dd_lu_dev, fid, NULL);
/* Success is the expected case; the conversion of \a o into \a fo happens
 * on a line that is not visible in this listing. */
112 if (likely(!IS_ERR(o)))
/* Otherwise keep the error code but re-type the pointer. */
115 fo = ERR_CAST(o); /* return error */
121 * Get FID of parent MDT object.
123 * This function reads extended attribute XATTR_NAME_FID of OFD object which
124 * contains the MDT parent object FID and saves it in ofd_object::ofo_pfid.
126 * The filter_fid::ff_parent::f_ver field currently holds
127 * the OST-object index in the parent MDT-object's layout EA,
128 * not the actual FID::f_ver of the parent. We therefore access
129 * it via the macro f_stripe_idx.
131 * \param[in] env execution environment
132 * \param[in] fo OFD object
134 * \retval 0 if successful
135 * \retval -ENODATA if there is no such xattr
136 * \retval negative value on error
/* Read the XATTR_NAME_FID xattr of \a fo and cache the parent FID it holds
 * in fo->ofo_pfid. */
138 int ofd_object_ff_load(const struct lu_env *env, struct ofd_object *fo)
/* The xattr scratch area and the lu_buf both live in the per-thread info. */
140 struct ofd_thread_info *info = ofd_info(env);
141 struct filter_fid_old *ff = &info->fti_mds_fid_old;
142 struct lu_buf *buf = &info->fti_buf;
143 struct lu_fid *pfid = &fo->ofo_pfid;
/* A sane ofo_pfid means the parent FID is already cached; the guarded
 * statement is not visible here (presumably an early return -- confirm). */
146 if (fid_is_sane(pfid))
/* Size the read buffer for a whole struct filter_fid_old. */
150 buf->lb_len = sizeof(*ff);
/* Fetch the xattr from the underlying dt object. */
151 rc = dt_xattr_get(env, ofd_object_child(fo), buf, XATTR_NAME_FID,
/* A result shorter than struct lu_fid cannot hold the parent FID.
 * NOTE(review): sizeof() is unsigned, so if rc is a signed int a negative
 * rc would not satisfy this test; confirm that a separate "rc < 0" check
 * precedes it (no such line is visible in this listing). */
156 if (rc < sizeof(struct lu_fid)) {
/* Convert the little-endian xattr fields into the cached parent FID. */
162 pfid->f_seq = le64_to_cpu(ff->ff_parent.f_seq);
163 pfid->f_oid = le32_to_cpu(ff->ff_parent.f_oid);
164 pfid->f_stripe_idx = le32_to_cpu(ff->ff_parent.f_stripe_idx);
170 * Put OFD object reference.
172 * \param[in] env execution environment
173 * \param[in] fo OFD object
/* Release one reference on \a fo. */
175 void ofd_object_put(const struct lu_env *env, struct ofd_object *fo)
/* The reference is dropped on the lu_object embedded in the OFD object. */
177 lu_object_put(env, &fo->ofo_obj.do_lu);
181 * Precreate the given number \a nr of objects in the given sequence \a oseq.
183 * This function precreates new OST objects in the given sequence.
184 * The precreation starts from \a id and creates \a nr objects sequentially.
187 * This function may create fewer objects than requested.
189 * We mark object SUID+SGID to flag it for accepting UID+GID from client on
190 * first write. Currently the permission bits on the OST are never used,
193 * Initialize a/c/m time so any client timestamp will always be newer and
194 * update the inode. The ctime = 0 case is also handled specially in
195 * osd_inode_setattr(). See LU-221, LU-1042 for details.
197 * \param[in] env execution environment
198 * \param[in] ofd OFD device
199 * \param[in] id object ID to start precreation from
200 * \param[in] oseq object sequence
201 * \param[in] nr number of objects to precreate
202 * \param[in] sync synchronous precreation flag
204 * \retval 0 if successful
205 * \retval negative value on error
/* Precreate up to \a nr objects, starting at \a id, in sequence \a oseq.
 * NOTE(review): this listing omits short lines (braces, several locals and
 * error checks), so the comments below describe only the statements that
 * are visible. */
207 int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
208 obd_id id, struct ofd_seq *oseq, int nr, int sync)
210 struct ofd_thread_info *info = ofd_info(env);
211 struct ofd_object *fo = NULL;
212 struct dt_object *next;
214 struct ofd_object **batch;
215 struct lu_fid *fid = &info->fti_fid;
224 /* Don't create objects beyond the valid range for this SEQ */
/* MDT0 sequences are bounded by IDIF_MAX_OID, all other sequences by
 * OBIF_MAX_OID; reaching either limit is reported as -ENOSPC.
 * NOTE(review): the "object ... exists" message further down passes the
 * sequence before the ID, whereas these two CERROR calls pass (id, seq)
 * to DOSTID -- confirm the argument order DOSTID expects. */
225 if (unlikely(fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
226 (id + nr) >= IDIF_MAX_OID)) {
227 CERROR("%s:"DOSTID" hit the IDIF_MAX_OID (1<<48)!\n",
228 ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
229 RETURN(rc = -ENOSPC);
230 } else if (unlikely(!fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
231 (id + nr) >= OBIF_MAX_OID)) {
232 CERROR("%s:"DOSTID" hit the OBIF_MAX_OID (1<<32)!\n",
233 ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
234 RETURN(rc = -ENOSPC);
/* Array of object pointers for this batch, one slot per nr_saved entry.
 * nr_saved is assigned on a line not visible here, and no allocation
 * failure check is visible either -- confirm both. */
237 OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
/* Attributes used for every new object: a regular file with the SUID and
 * SGID bits set and permission bits 0666. */
241 info->fti_attr.la_valid = LA_TYPE | LA_MODE;
242 info->fti_attr.la_mode = S_IFREG | S_ISUID | S_ISGID | 0666;
243 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
/* atime, mtime and ctime are all set to 0 as well. */
245 info->fti_attr.la_valid |= LA_ATIME | LA_MTIME | LA_CTIME;
246 info->fti_attr.la_atime = 0;
247 info->fti_attr.la_mtime = 0;
248 info->fti_attr.la_ctime = 0;
252 /* prepare objects */
/* Start from the FID of the sequence's LAST_ID object, then substitute
 * the object ID id + i on each iteration. */
253 *fid = *lu_object_fid(&oseq->os_lastid_obj->do_lu);
254 for (i = 0; i < nr; i++) {
255 rc = fid_set_id(fid, id + i);
/* Find the OFD object for this FID; the failure path jumps to "out". */
264 fo = ofd_object_find(env, ofd, fid);
267 GOTO(out, rc = PTR_ERR(fo));
/* Each object found is write-locked; the matching unlock is in the
 * cleanup loop near the end of the function. */
273 ofd_write_lock(env, fo);
/* The LAST_ID record written below is the 64-bit value held in tmp. */
276 info->fti_buf.lb_buf = &tmp;
277 info->fti_buf.lb_len = sizeof(tmp);
/* Create the transaction handle; the failure path jumps to "out". */
280 th = ofd_trans_create(env, ofd);
282 GOTO(out, rc = PTR_ERR(th));
/* Declare the write to the LAST_ID object. */
286 rc = dt_declare_record_write(env, oseq->os_lastid_obj, &info->fti_buf,
289 GOTO(trans_stop, rc);
/* Declare phase for the objects; how fo is selected for each iteration
 * is not visible in this listing (presumably batch[i] -- confirm). */
291 for (i = 0; i < nr; i++) {
295 if (unlikely(ofd_object_exists(fo))) {
296 /* object may exist being re-created by write replay */
297 CDEBUG(D_INODE, "object "LPX64"/"LPX64" exists: "
298 DFID"\n", ostid_seq(&oseq->os_oi), id,
299 PFID(lu_object_fid(&fo->ofo_obj.do_lu)));
303 next = ofd_object_child(fo);
304 LASSERT(next != NULL);
/* Declare the creation on the underlying dt object. */
306 rc = dt_declare_create(env, next, &info->fti_attr, NULL,
310 GOTO(trans_stop, rc);
/* Start the transaction as a local one on the OSD device. */
317 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
319 GOTO(trans_stop, rc);
321 CDEBUG(D_OTHER, "%s: create new object "DFID" nr %d\n",
322 ofd_name(ofd), PFID(fid), nr);
324 /* When LFSCK scans the whole device to verify the LAST_ID file's
325 * consistency, it first loads the last_id into RAM and compares
326 * the last_id with each OST-object's ID. If the latter is larger,
327 * it regards the LAST_ID file as crashed. But during the LFSCK
328 * scan, the OFD may continue to create new OST-objects. Those newly
329 * created OST-objects will have larger IDs than the ones LFSCK knows.
330 * So LFSCK needs to re-load the last_id from the on-disk
331 * file, and if the latest last_id is still smaller than the object's
332 * ID, then the LAST_ID file has really crashed.
334 * For the above mechanism to work, before the OFD pre-creates OST-objects
335 * it must update the LAST_ID file first; otherwise LFSCK
336 * may not see the latest last_id even though new OST-objects were created. */
/* The value written is the highest ID of the requested batch, id + nr - 1;
 * the write is skipped when OBD_FAIL_LFSCK_SKIP_LASTID is injected. */
337 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_SKIP_LASTID)) {
338 tmp = cpu_to_le64(id + nr - 1);
339 dt_write_lock(env, oseq->os_lastid_obj, 0);
340 rc = dt_record_write(env, oseq->os_lastid_obj,
341 &info->fti_buf, &info->fti_off, th);
342 dt_write_unlock(env, oseq->os_lastid_obj);
344 GOTO(trans_stop, rc);
/* Execute phase: create the objects one by one. */
347 for (i = 0; i < nr; i++) {
351 /* Only the new created objects need to be recorded. */
/* When the OSD has dd_record_fid_accessed set, pack the object's FID into
 * the per-thread LFSCK request and notify LFSCK. */
352 if (ofd->ofd_osd->dd_record_fid_accessed) {
353 lfsck_pack_rfa(&ofd_info(env)->fti_lr,
354 lu_object_fid(&fo->ofo_obj.do_lu));
355 lfsck_in_notify(env, ofd->ofd_osd,
356 &ofd_info(env)->fti_lr);
/* Create only if the object does not exist yet and the
 * OBD_FAIL_LFSCK_DANGLING fault is not injected. */
359 if (likely(!ofd_object_exists(fo) &&
360 !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DANGLING))) {
361 next = ofd_object_child(fo);
362 LASSERT(next != NULL);
364 rc = dt_create(env, next, &info->fti_attr, NULL,
368 GOTO(trans_stop, rc);
373 LASSERT(ofd_object_exists(fo));
/* Advance the sequence's last object ID to the one just handled. */
375 ofd_seq_last_oid_set(oseq, id + i);
379 /* Not all of the wanted objects have been created;
380 * set the LAST_ID to the last one actually created. */
/* The result of this second LAST_ID write is kept in rc1, separate from
 * rc; the condition guarding the CERROR below is not visible here. */
381 if (unlikely(objects < nr)) {
385 tmp = cpu_to_le64(ofd_seq_last_oid(oseq));
386 dt_write_lock(env, oseq->os_lastid_obj, 0);
387 rc1 = dt_record_write(env, oseq->os_lastid_obj,
388 &info->fti_buf, &info->fti_off, th);
389 dt_write_unlock(env, oseq->os_lastid_obj);
391 CERROR("%s: fail to reset the LAST_ID for seq ("LPX64
392 ") from "LPU64" to "LPU64"\n", ofd_name(ofd),
393 ostid_seq(&oseq->os_oi), id + nr - 1,
394 ofd_seq_last_oid(oseq));
/* Stop the transaction, passing along the current rc. */
398 ofd_trans_stop(env, ofd, th, rc);
/* Cleanup: unlock and release the batch objects, then free the array
 * with the same size expression used for the allocation. */
400 for (i = 0; i < nr_saved; i++) {
403 ofd_write_unlock(env, fo);
404 ofd_object_put(env, fo);
407 OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));
/* Creating nothing while rc is still 0 is logged at D_ERROR level. */
409 CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
410 "created %d/%d objects: %d\n", objects, nr_saved, rc);
/* Zero created objects must come with a negative rc. The return value is
 * the number of objects created when positive, otherwise rc. */
412 LASSERT(ergo(objects == 0, rc < 0));
413 RETURN(objects > 0 ? objects : rc);
417 * Fix the OFD object ownership.
419 * If the object still has SUID+SGID bits set, meaning that it was precreated
420 * by the MDT before it was assigned to any file, (see ofd_precreate_objects())
421 * then we will accept the UID+GID if sent by the client for initializing the
422 * ownership of this object. We only allow this to happen once (so clear these
423 * bits) and later only allow setattr.
425 * \param[in] env execution environment
426 * \param[in] fo OFD object
427 * \param[in] la object attributes
428 * \param[in] is_setattr was this function called from setattr or not
430 * \retval 0 if successful
431 * \retval negative value on error
/* Adjust the UID/GID/mode fields of \a la according to the SUID/SGID bits
 * currently stored in the object's mode.
 * NOTE(review): this listing omits short lines (braces, some conditions and
 * assignments), so the comments describe only the visible statements. */
433 int ofd_attr_handle_ugid(const struct lu_env *env, struct ofd_object *fo,
434 struct lu_attr *la, int is_setattr)
436 struct ofd_thread_info *info = ofd_info(env);
/* Per-thread scratch lu_attr that receives the object's current attributes. */
437 struct lu_attr *ln = &info->fti_attr2;
/* Neither UID nor GID is being set; the guarded statement is not visible
 * here (presumably an early return -- confirm). */
443 if (!(la->la_valid & LA_UID) && !(la->la_valid & LA_GID))
/* Read the current attributes from the underlying dt object. */
446 rc = dt_attr_get(env, ofd_object_child(fo), ln, BYPASS_CAPA);
/* The attributes read back must include the mode. */
450 LASSERT(ln->la_valid & LA_MODE);
/* If the stored mode no longer carries S_ISUID (S_ISGID), the UID (GID)
 * is removed from the set of attributes to update. */
453 if (!(ln->la_mode & S_ISUID))
454 la->la_valid &= ~LA_UID;
455 if (!(ln->la_mode & S_ISGID))
456 la->la_valid &= ~LA_GID;
/* For a UID/GID that is still being set while its bit is present in the
 * stored mode; the guarded statements are not visible here (presumably
 * they add the bit to "mask" -- confirm). */
459 if ((la->la_valid & LA_UID) && (ln->la_mode & S_ISUID))
461 if ((la->la_valid & LA_GID) && (ln->la_mode & S_ISGID))
/* Without a caller-supplied mode, or outside setattr, take the stored
 * mode as the one to write back. */
464 if (!(la->la_valid & LA_MODE) || !is_setattr) {
465 la->la_mode = ln->la_mode;
466 la->la_valid |= LA_MODE;
/* Clear the bits collected in mask from the mode to be written. */
468 la->la_mode &= ~mask;
475 * Set OFD object attributes.
477 * This function sets OFD object attributes taken from incoming request.
478 * It sets not only regular attributes but also XATTR_NAME_FID extended
479 * attribute if needed. The "fid" xattr allows the object's MDT parent inode
480 * to be found and verified by LFSCK and other tools in case of inconsistency.
482 * \param[in] env execution environment
483 * \param[in] fo OFD object
484 * \param[in] la object attributes
485 * \param[in] ff filter_fid structure, contains additional attributes
487 * \retval 0 if successful
488 * \retval negative value on error
/* Apply the attributes in \a la to \a fo and write the XATTR_NAME_FID
 * xattr from \a ff, all inside one transaction.
 * NOTE(review): this listing omits short lines (braces, error checks and
 * the conditions guarding the xattr steps), so the comments describe only
 * the statements that are visible. */
490 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
491 struct lu_attr *la, struct filter_fid *ff)
493 struct ofd_thread_info *info = ofd_info(env);
494 struct ofd_device *ofd = ofd_obj2dev(fo);
496 struct ofd_mod_data *fmd;
/* The checks and the update run with the object write-locked; a missing
 * object yields -ENOENT. */
501 ofd_write_lock(env, fo);
502 if (!ofd_object_exists(fo))
503 GOTO(unlock, rc = -ENOENT);
/* When any timestamp is being set, raise the fmd_mactime_xid recorded for
 * this export and FID to the current request's xid if it is lower. */
505 if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) {
506 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
507 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
508 fmd->fmd_mactime_xid = info->fti_xid;
509 ofd_fmd_put(info->fti_exp, fmd);
512 /* VBR: version recovery check */
513 rc = ofd_version_get_check(info, fo);
/* Fix up UID/GID/mode; the last argument marks this as the setattr path. */
517 rc = ofd_attr_handle_ugid(env, fo, la, 1 /* is_setattr */);
/* Load the parent FID xattr into fo->ofo_pfid. */
522 rc = ofd_object_ff_load(env, fo);
/* Create the transaction handle. */
529 th = ofd_trans_create(env, ofd);
531 GOTO(unlock, rc = PTR_ERR(th));
/* Declare phase: the attribute update, then the FID xattr whose payload
 * is the caller's filter_fid. */
533 rc = dt_declare_attr_set(env, ofd_object_child(fo), la, th);
538 info->fti_buf.lb_buf = ff;
539 info->fti_buf.lb_len = sizeof(*ff);
540 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
541 &info->fti_buf, XATTR_NAME_FID, 0,
/* The object is handed to ofd_trans_start() only when LA_SIZE is set;
 * otherwise NULL is passed. */
547 rc = ofd_trans_start(env, ofd, la->la_valid & LA_SIZE ? fo : NULL, th);
/* Execute phase: set the attributes, then write the FID xattr. */
551 rc = dt_attr_set(env, ofd_object_child(fo), la, th,
552 ofd_object_capa(env, fo));
557 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
558 XATTR_NAME_FID, 0, th, BYPASS_CAPA);
/* Mirror the parent FID from \a ff into the in-memory ofo_pfid,
 * converting each field from little-endian. */
560 fo->ofo_pfid.f_seq = le64_to_cpu(ff->ff_parent.f_seq);
561 fo->ofo_pfid.f_oid = le32_to_cpu(ff->ff_parent.f_oid);
562 /* Currently, the filter_fid::ff_parent::f_ver is not
563 * the real parent MDT-object's FID::f_ver, instead it
564 * is the OST-object index in its parent MDT-object's
566 fo->ofo_pfid.f_stripe_idx =
567 le32_to_cpu(ff->ff_parent.f_stripe_idx);
574 ofd_trans_stop(env, ofd, th, rc);
576 ofd_write_unlock(env, fo);
582 * Truncate/punch OFD object.
584 * This function frees all of the allocated object's space from the \a start
585 * offset to the \a end offset. For truncate() operations the \a end offset
586 * is OBD_OBJECT_EOF. The functionality to punch holes in an object via
587 * fallocate(FALLOC_FL_PUNCH_HOLE) is not yet implemented (see LU-3606).
589 * \param[in] env execution environment
590 * \param[in] fo OFD object
591 * \param[in] start start offset to punch from
592 * \param[in] end end of punch
593 * \param[in] la object attributes
594 * \param[in] ff filter_fid structure
595 * \param[in] oa obdo struct from incoming request
597 * \retval 0 if successful
598 * \retval negative value on error
600 int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
601 __u64 start, __u64 end, struct lu_attr *la,
602 struct filter_fid *ff, struct obdo *oa)
604 struct ofd_thread_info *info = ofd_info(env);
605 struct ofd_device *ofd = ofd_obj2dev(fo);
606 struct ofd_mod_data *fmd;
607 struct dt_object *dob = ofd_object_child(fo);
614 /* we support truncate, not punch yet */
615 LASSERT(end == OBD_OBJECT_EOF);
/* NOTE(review): this listing omits short lines (braces, error checks and
 * the conditions guarding the xattr steps), so the comments describe only
 * the statements that are visible. */
/* Raise the fmd_mactime_xid recorded for this export and FID to the
 * current request's xid if it is lower; this is done before the object
 * lock is taken. */
617 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
618 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
619 fmd->fmd_mactime_xid = info->fti_xid;
620 ofd_fmd_put(info->fti_exp, fmd);
/* The remaining steps run with the object write-locked; a missing object
 * yields -ENOENT. */
622 ofd_write_lock(env, fo);
623 if (!ofd_object_exists(fo))
624 GOTO(unlock, rc = -ENOENT);
/* When ofd_lfsck_verify_pfid is set and the request carries
 * OBD_MD_FLFID, call ofd_verify_ff() for this object and request. */
626 if (ofd->ofd_lfsck_verify_pfid && oa->o_valid & OBD_MD_FLFID) {
627 rc = ofd_verify_ff(env, fo, oa);
632 /* VBR: version recovery check */
633 rc = ofd_version_get_check(info, fo);
/* Fix up UID/GID/mode; the last argument marks this as a non-setattr path. */
637 rc = ofd_attr_handle_ugid(env, fo, la, 0 /* !is_setattr */);
/* Load the parent FID xattr into fo->ofo_pfid. */
642 rc = ofd_object_ff_load(env, fo);
/* Create the transaction handle. */
649 th = ofd_trans_create(env, ofd);
651 GOTO(unlock, rc = PTR_ERR(th));
/* Declare phase: the attribute update, the punch from \a start to
 * OBD_OBJECT_EOF, then the FID xattr whose payload is the caller's
 * filter_fid. */
653 rc = dt_declare_attr_set(env, dob, la, th);
657 rc = dt_declare_punch(env, dob, start, OBD_OBJECT_EOF, th);
662 info->fti_buf.lb_buf = ff;
663 info->fti_buf.lb_len = sizeof(*ff);
664 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
665 &info->fti_buf, XATTR_NAME_FID, 0,
/* Unlike ofd_attr_set(), the object is always passed to ofd_trans_start(). */
671 rc = ofd_trans_start(env, ofd, fo, th);
/* Execute phase: punch first, then set the attributes, then write the
 * FID xattr. */
675 rc = dt_punch(env, dob, start, OBD_OBJECT_EOF, th,
676 ofd_object_capa(env, fo));
680 rc = dt_attr_set(env, dob, la, th, ofd_object_capa(env, fo));
685 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
686 XATTR_NAME_FID, 0, th, BYPASS_CAPA);
/* Mirror the parent FID from \a ff into the in-memory ofo_pfid,
 * converting each field from little-endian. */
688 fo->ofo_pfid.f_seq = le64_to_cpu(ff->ff_parent.f_seq);
689 fo->ofo_pfid.f_oid = le32_to_cpu(ff->ff_parent.f_oid);
690 /* Currently, the filter_fid::ff_parent::f_ver is not
691 * the real parent MDT-object's FID::f_ver, instead it
692 * is the OST-object index in its parent MDT-object's
694 fo->ofo_pfid.f_stripe_idx =
695 le32_to_cpu(ff->ff_parent.f_stripe_idx);
702 ofd_trans_stop(env, ofd, th, rc);
704 ofd_write_unlock(env, fo);
710 * Destroy OFD object.
712 * This function destroys the OFD object. If the object was never used
713 * (an orphan), a local transaction is used, which means the transaction
714 * data is not returned in the reply.
716 * \param[in] env execution environment
717 * \param[in] fo OFD object
718 * \param[in] orphan flag to indicate that object is orphaned
720 * \retval 0 if successful
721 * \retval negative value on error
/* Destroy \a fo inside a transaction while holding its write lock.
 * NOTE(review): this listing omits short lines (the second parameter,
 * braces, error checks and the orphan test), so the comments describe only
 * the statements that are visible. */
723 int ofd_object_destroy(const struct lu_env *env, struct ofd_object *fo,
726 struct ofd_device *ofd = ofd_obj2dev(fo);
/* A missing object yields -ENOENT. */
732 ofd_write_lock(env, fo);
733 if (!ofd_object_exists(fo))
734 GOTO(unlock, rc = -ENOENT);
/* Create the transaction handle. */
736 th = ofd_trans_create(env, ofd);
738 GOTO(unlock, rc = PTR_ERR(th));
/* Declare the reference drop and the destroy.
 * NOTE(review): the results of both declare calls are not captured on
 * these lines -- confirm whether a declare failure should abort here. */
740 dt_declare_ref_del(env, ofd_object_child(fo), th);
741 dt_declare_destroy(env, ofd_object_child(fo), th);
/* Two ways of starting the transaction are visible: a local start on the
 * OSD device and ofd_trans_start(). The condition choosing between them is
 * not visible here (presumably the orphan flag -- confirm). */
743 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
745 rc = ofd_trans_start(env, ofd, NULL, th);
/* ofd_fmd_drop() is called for this export and the object's FID. */
749 ofd_fmd_drop(ofd_info(env)->fti_exp, &fo->ofo_header.loh_fid);
/* Execute the reference drop and the destroy; as with the declare calls,
 * their results are not captured on these lines. */
751 dt_ref_del(env, ofd_object_child(fo), th);
752 dt_destroy(env, ofd_object_child(fo), th);
/* Stop the transaction with the current rc, then release the lock. */
754 ofd_trans_stop(env, ofd, th, rc);
756 ofd_write_unlock(env, fo);
761 * Get OFD object attributes.
763 * This function gets OFD object regular attributes. It is used to serve
764 * incoming request as well as for local OFD purposes.
766 * \param[in] env execution environment
767 * \param[in] fo OFD object
768 * \param[in] la object attributes
770 * \retval 0 if successful
771 * \retval negative value on error
773 int ofd_attr_get(const struct lu_env *env, struct ofd_object *fo,
780 if (ofd_object_exists(fo)) {
781 rc = dt_attr_get(env, ofd_object_child(fo), la,
782 ofd_object_capa(env, fo));
784 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 53, 0)
785 /* Try to correct for a bug in 2.1.0 (LU-221) that caused
786 * negative timestamps to appear to be in the far future,
787 * due to the old timestamp being stored on disk as an unsigned value.
788 * This fixes up any bad values stored on disk before
789 * returning them to the client, and ensures any timestamp
790 * updates are correct. LU-1042 */
791 if (unlikely(la->la_atime == LU221_BAD_TIME))
793 if (unlikely(la->la_mtime == LU221_BAD_TIME))
795 if (unlikely(la->la_ctime == LU221_BAD_TIME))