4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2012, 2013, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/ofd/ofd_objects.c
38 * Author: Alex Zhuravlev <bzzz@whamcloud.com>
39 * Author: Mikhail Pershin <tappro@whamcloud.com>
42 #define DEBUG_SUBSYSTEM S_FILTER
44 #include <dt_object.h>
45 #include <lustre/lustre_idl.h>
46 #include <lustre_lfsck.h>
48 #include "ofd_internal.h"
/*
 * Version-based recovery (VBR) check for object fo.
 *
 * Reads the current version of the object and compares it with
 * info->fti_pre_version (presumably the pre-operation version carried by
 * the request; confirm against the caller).  A non-zero fti_pre_version
 * that differs from the current version is logged and the export is
 * flagged with exp_vbr_failed under exp_lock.  The current version is then
 * stored into info->fti_pre_version.
 *
 * NOTE(review): the body of the -EOPNOTSUPP branch and the return
 * statements are not visible in this listing, so the return values are
 * intentionally not documented here.
 */
50 int ofd_version_get_check(struct ofd_thread_info *info,
51 struct ofd_object *fo)
53 dt_obj_version_t curr_version;
/* The caller must pass an object that exists. */
55 LASSERT(ofd_object_exists(fo));
60 curr_version = dt_version_get(info->fti_env, ofd_object_child(fo));
/* The version value is also used to report -EOPNOTSUPP, hence the
 * signed cast before the comparison. */
61 if ((__s64)curr_version == -EOPNOTSUPP)
63 /* VBR: version is checked always because costs nothing */
64 if (info->fti_pre_version != 0 &&
65 info->fti_pre_version != curr_version) {
66 CDEBUG(D_INODE, "Version mismatch "LPX64" != "LPX64"\n",
67 info->fti_pre_version, curr_version);
/* Mark the VBR failure on the export; the flag is updated under
 * the export spinlock. */
68 spin_lock(&info->fti_exp->exp_lock);
69 info->fti_exp->exp_vbr_failed = 1;
70 spin_unlock(&info->fti_exp->exp_lock);
/* Remember the version that was just read. */
73 info->fti_pre_version = curr_version;
/*
 * Look up the OFD object identified by fid on device ofd.
 *
 * The lookup is done with lu_object_find() on the lu_device embedded in
 * ofd->ofd_dt_dev.  When the lookup yields an error pointer, that error is
 * converted with ERR_CAST() to the return type.
 *
 * NOTE(review): the success branch and the return statement are not
 * visible in this listing.
 */
77 struct ofd_object *ofd_object_find(const struct lu_env *env,
78 struct ofd_device *ofd,
79 const struct lu_fid *fid)
81 struct ofd_object *fo;
86 o = lu_object_find(env, &ofd->ofd_dt_dev.dd_lu_dev, fid, NULL);
/* Success is the expected case. */
87 if (likely(!IS_ERR(o)))
90 fo = ERR_CAST(o); /* return error */
/*
 * Find the object identified by fid on the underlying OSD device, creating
 * it through dt_find_or_create() when needed, and return the matching OFD
 * object.
 *
 * The object format is set to the one derived from S_IFREG.  An error
 * pointer is passed back through ERR_CAST().
 *
 * NOTE(review): the remaining parameter line(s) of the signature (attr is
 * used below) and the condition guarding the error return are not visible
 * in this listing.
 */
95 struct ofd_object *ofd_object_find_or_create(const struct lu_env *env,
96 struct ofd_device *ofd,
97 const struct lu_fid *fid,
100 struct ofd_thread_info *info = ofd_info(env);
101 struct lu_object *fo_obj;
102 struct dt_object *dto;
/* Use the per-thread object format descriptor, set up for a regular
 * file. */
106 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
108 dto = dt_find_or_create(env, ofd->ofd_osd, fid, &info->fti_dof, attr);
110 RETURN(ERR_CAST(dto));
/* Locate the lu_object of the OFD device type under the same object
 * header and convert it to an ofd_object. */
112 fo_obj = lu_object_locate(dto->do_lu.lo_header,
113 ofd->ofd_dt_dev.dd_lu_dev.ld_type);
114 RETURN(ofd_obj(fo_obj));
/*
 * Load the parent FID of object fo from its XATTR_NAME_FID extended
 * attribute into the in-memory copy fo->ofo_pfid.
 *
 * The xattr is fetched with dt_xattr_get() using the per-thread buffer
 * descriptor info->fti_buf, sized for a struct filter_fid_old.  The
 * parent FID fields are converted from little-endian to CPU byte order.
 *
 * NOTE(review): the actions taken when ofo_pfid is already sane and when
 * the xattr is shorter than a struct lu_fid, the buffer pointer setup and
 * the return statements are not visible in this listing.
 */
117 int ofd_object_ff_load(const struct lu_env *env, struct ofd_object *fo)
119 struct ofd_thread_info *info = ofd_info(env);
120 struct filter_fid_old *ff = &info->fti_mds_fid_old;
121 struct lu_buf *buf = &info->fti_buf;
122 struct lu_fid *pfid = &fo->ofo_pfid;
/* Check whether a sane parent FID is already held in memory. */
125 if (fid_is_sane(pfid))
129 buf->lb_len = sizeof(*ff);
130 rc = dt_xattr_get(env, ofd_object_child(fo), buf, XATTR_NAME_FID,
/* The returned length must cover at least a struct lu_fid. */
135 if (rc < sizeof(struct lu_fid)) {
/* Decode the on-disk little-endian parent FID. */
141 pfid->f_seq = le64_to_cpu(ff->ff_parent.f_seq);
142 pfid->f_oid = le32_to_cpu(ff->ff_parent.f_oid);
143 /* Currently, the filter_fid::ff_parent::f_ver is not the real parent
144 * MDT-object's FID::f_ver, instead it is the OST-object index in its
145 * parent MDT-object's layout EA. */
146 pfid->f_stripe_idx = le32_to_cpu(ff->ff_parent.f_stripe_idx);
/*
 * Drop a reference on OFD object fo by calling lu_object_put() on the
 * lu_object embedded in it.
 */
151 void ofd_object_put(const struct lu_env *env, struct ofd_object *fo)
153 lu_object_put(env, &fo->ofo_obj.do_lu);
/*
 * Precreate up to nr OST-objects in sequence oseq, starting at object ID id.
 *
 * Visible flow:
 *  - refuse ID ranges that would reach the per-sequence limit
 *    (IDIF_MAX_OID for an MDT0 sequence, OBIF_MAX_OID otherwise) with
 *    -ENOSPC;
 *  - allocate a batch array and set up the creation attributes;
 *  - look up and write-lock the objects;
 *  - declare the LAST_ID record write and the creates in one transaction
 *    and start it locally;
 *  - write the new LAST_ID before creating the objects, then create them;
 *  - when fewer objects than requested were created, rewrite LAST_ID with
 *    the last ID actually reached;
 *  - stop the transaction, unlock and release the objects, free the batch.
 *
 * Returns the number of created objects when it is positive, otherwise rc;
 * rc is asserted to be negative when no object was created.
 *
 * NOTE(review): several local declarations, error checks, loop bodies,
 * counters and labels are not visible in this listing; the comments below
 * describe only the statements that are present.
 */
156 int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
157 obd_id id, struct ofd_seq *oseq, int nr, int sync)
159 struct ofd_thread_info *info = ofd_info(env);
160 struct ofd_object *fo = NULL;
161 struct dt_object *next;
163 struct ofd_object **batch;
164 struct lu_fid *fid = &info->fti_fid;
173 /* Don't create objects beyond the valid range for this SEQ */
174 if (unlikely(fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
175 (id + nr) >= IDIF_MAX_OID)) {
/* NOTE(review): the values are passed as (id, seq); confirm that
 * this matches the field order expected by DOSTID. */
176 CERROR("%s:"DOSTID" hit the IDIF_MAX_OID (1<<48)!\n",
177 ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
178 RETURN(rc = -ENOSPC);
179 } else if (unlikely(!fid_seq_is_mdt0(ostid_seq(&oseq->os_oi)) &&
180 (id + nr) >= OBIF_MAX_OID)) {
181 CERROR("%s:"DOSTID" hit the OBIF_MAX_OID (1<<32)!\n",
182 ofd_name(ofd), id, ostid_seq(&oseq->os_oi));
183 RETURN(rc = -ENOSPC);
/* One pointer slot per requested object; the same size expression is
 * used for OBD_FREE() at the end. */
186 OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
/* Attributes applied to every object created below. */
190 info->fti_attr.la_valid = LA_TYPE | LA_MODE;
192 * We mark object SUID+SGID to flag it for accepting UID+GID from
193 * client on first write. Currently the permission bits on the OST are
194 * never used, so this is OK.
196 info->fti_attr.la_mode = S_IFREG | S_ISUID | S_ISGID | 0666;
197 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
199 /* Initialize a/c/m time so any client timestamp will always
200 * be newer and update the inode. ctime = 0 is also handled
201 * specially in osd_inode_setattr(). See LU-221, LU-1042 */
202 info->fti_attr.la_valid |= LA_ATIME | LA_MTIME | LA_CTIME;
203 info->fti_attr.la_atime = 0;
204 info->fti_attr.la_mtime = 0;
205 info->fti_attr.la_ctime = 0;
209 /* prepare objects */
/* Start from the FID of the LAST_ID object and replace only the
 * object ID for each entry of the batch. */
210 *fid = *lu_object_fid(&oseq->os_lastid_obj->do_lu);
211 for (i = 0; i < nr; i++) {
212 rc = fid_set_id(fid, id + i);
221 fo = ofd_object_find(env, ofd, fid);
224 GOTO(out, rc = PTR_ERR(fo));
/* The object stays write-locked until the cleanup loop at the
 * end of the function. */
230 ofd_write_lock(env, fo);
/* The LAST_ID value is written from the local variable tmp. */
233 info->fti_buf.lb_buf = &tmp;
234 info->fti_buf.lb_len = sizeof(tmp);
237 th = ofd_trans_create(env, ofd);
239 GOTO(out, rc = PTR_ERR(th));
/* Declare the LAST_ID update as part of the transaction. */
243 rc = dt_declare_record_write(env, oseq->os_lastid_obj, &info->fti_buf,
246 GOTO(trans_stop, rc);
/* Declare the creation of the objects. */
248 for (i = 0; i < nr; i++) {
252 if (unlikely(ofd_object_exists(fo))) {
253 /* object may exist being re-created by write replay */
254 CDEBUG(D_INODE, "object "LPX64"/"LPX64" exists: "
255 DFID"\n", ostid_seq(&oseq->os_oi), id,
256 PFID(lu_object_fid(&fo->ofo_obj.do_lu)));
260 next = ofd_object_child(fo);
261 LASSERT(next != NULL);
263 rc = dt_declare_create(env, next, &info->fti_attr, NULL,
271 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
273 GOTO(trans_stop, rc);
275 CDEBUG(D_OTHER, "%s: create new object "DFID" nr %d\n",
276 ofd_name(ofd), PFID(fid), nr);
280 /* When the LFSCK scanning the whole device to verify the LAST_ID file
281 * consistency, it will load the last_id into RAM firstly, and compare
282 * the last_id with each OST-object's ID. If the later one is larger,
283 * then it will regard the LAST_ID file crashed. But during the LFSCK
284 * scanning, the OFD may continue to create new OST-objects. Those new
285 * created OST-objects will have larger IDs than the LFSCK known ones.
286 * So from the LFSCK view, it needs to re-load the last_id from disk
287 * file, and if the latest last_id is still smaller than the object's
288 * ID, then the LAST_ID file is real crashed.
290 * To make above mechanism to work, before OFD pre-create OST-objects,
291 * it needs to update the LAST_ID file firstly, otherwise, the LFSCK
292 * may cannot get latest last_id although new OST-object created. */
/* The early LAST_ID update is skipped only when the
 * OBD_FAIL_LFSCK_SKIP_LASTID fail check fires. */
293 if (!OBD_FAIL_CHECK(OBD_FAIL_LFSCK_SKIP_LASTID)) {
/* LAST_ID is stored little-endian; write the highest ID of the
 * requested range under the LAST_ID object write lock. */
294 tmp = cpu_to_le64(id + nr - 1);
295 dt_write_lock(env, oseq->os_lastid_obj, 0);
296 rc = dt_record_write(env, oseq->os_lastid_obj,
297 &info->fti_buf, &info->fti_off, th);
298 dt_write_unlock(env, oseq->os_lastid_obj);
300 GOTO(trans_stop, rc);
/* Create the objects. */
303 for (i = 0; i < nr; i++) {
307 /* Only the new created objects need to be recorded. */
308 if (ofd->ofd_osd->dd_record_fid_accessed) {
309 lfsck_pack_rfa(&ofd_info(env)->fti_lr,
310 lu_object_fid(&fo->ofo_obj.do_lu));
311 lfsck_in_notify(env, ofd->ofd_osd,
312 &ofd_info(env)->fti_lr);
/* Skip the create for objects that already exist, or when the
 * OBD_FAIL_LFSCK_DANGLING fail check fires. */
315 if (likely(!ofd_object_exists(fo) &&
316 !OBD_FAIL_CHECK(OBD_FAIL_LFSCK_DANGLING))) {
317 next = ofd_object_child(fo);
318 LASSERT(next != NULL);
320 rc = dt_create(env, next, &info->fti_attr, NULL,
324 LASSERT(ofd_object_exists(fo));
/* Advance the in-memory last object ID of the sequence. */
326 ofd_seq_last_oid_set(oseq, id + i);
330 /* NOT all the wanted objects have been created,
331 * set the LAST_ID as the real created. */
332 if (unlikely(objects < nr)) {
/* rc1 keeps the result of this corrective write separate from
 * rc. */
336 tmp = cpu_to_le64(ofd_seq_last_oid(oseq));
337 dt_write_lock(env, oseq->os_lastid_obj, 0);
338 rc1 = dt_record_write(env, oseq->os_lastid_obj,
339 &info->fti_buf, &info->fti_off, th);
340 dt_write_unlock(env, oseq->os_lastid_obj);
342 CERROR("%s: fail to reset the LAST_ID for seq ("LPX64
343 ") from "LPU64" to "LPU64"\n", ofd_name(ofd),
344 ostid_seq(&oseq->os_oi), id + nr - 1,
345 ofd_seq_last_oid(oseq));
349 ofd_trans_stop(env, ofd, th, rc);
/* Cleanup iterates with nr_saved rather than nr. */
351 for (i = 0; i < nr_saved; i++) {
354 ofd_write_unlock(env, fo);
355 ofd_object_put(env, fo);
358 OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));
/* Creating nothing while rc is still 0 is logged at D_ERROR level. */
360 CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
361 "created %d/%d objects: %d\n", objects, nr_saved, rc);
363 LASSERT(ergo(objects == 0, rc < 0));
364 RETURN(objects > 0 ? objects : rc);
368 * If the object still has SUID+SGID bits set (see ofd_precreate_objects()) then
369 * we will accept the UID+GID if sent by the client for initializing the
370 * ownership of this object. We only allow this to happen once (so clear these
371 * bits) and later only allow setattr.
/*
 * Decide whether the UID/GID supplied in la may be applied to object fo,
 * based on the SUID/SGID bits of the mode currently stored for the object.
 *
 * The current attributes are read into the per-thread info->fti_attr2.
 * For every ID that is both requested in la and still flagged in the
 * stored mode, the matching S_ISUID/S_ISGID bit is cleared from la->la_mode.
 * When la carries no mode, or when is_setattr is zero, la->la_mode is first
 * taken from the stored mode and LA_MODE is added to la->la_valid.
 *
 * NOTE(review): the early-exit body, the condition around the two
 * la_valid-clearing statements, the statements that build mask and the
 * return statements are not visible in this listing.
 */
373 int ofd_attr_handle_ugid(const struct lu_env *env, struct ofd_object *fo,
374 struct lu_attr *la, int is_setattr)
376 struct ofd_thread_info *info = ofd_info(env);
377 struct lu_attr *ln = &info->fti_attr2;
/* Check whether the caller supplied neither a UID nor a GID. */
383 if (!(la->la_valid & LA_UID) && !(la->la_valid & LA_GID))
/* Read the attributes currently stored for the object. */
386 rc = dt_attr_get(env, ofd_object_child(fo), ln, BYPASS_CAPA);
390 LASSERT(ln->la_valid & LA_MODE);
/* Drop the UID/GID from the request when the stored mode no longer has
 * the corresponding SUID/SGID flag. */
393 if (!(ln->la_mode & S_ISUID))
394 la->la_valid &= ~LA_UID;
395 if (!(ln->la_mode & S_ISGID))
396 la->la_valid &= ~LA_GID;
/* An ID that is requested and whose flag is still set in the stored
 * mode. */
399 if ((la->la_valid & LA_UID) && (ln->la_mode & S_ISUID))
401 if ((la->la_valid & LA_GID) && (ln->la_mode & S_ISGID))
/* Base the new mode on the stored one unless a setattr supplied its
 * own mode. */
404 if (!(la->la_valid & LA_MODE) || !is_setattr) {
405 la->la_mode = ln->la_mode;
406 la->la_valid |= LA_MODE;
/* Clear the bits collected in mask from the mode being applied. */
408 la->la_mode &= ~mask;
/*
 * Set attributes la on object fo and store the filter FID ff in the
 * XATTR_NAME_FID extended attribute.
 *
 * Visible flow, all under the object write lock:
 *  - fail with -ENOENT when the object does not exist;
 *  - when any timestamp is being set, raise fmd_mactime_xid of the
 *    per-export modification data to the current request xid;
 *  - run the VBR version check, the UID/GID handling (as a setattr) and
 *    load the cached parent FID;
 *  - create a transaction, declare the attribute and xattr updates,
 *    start it, then apply the attributes and the xattr;
 *  - refresh the in-memory parent FID fo->ofo_pfid from ff;
 *  - stop the transaction and release the write lock.
 *
 * NOTE(review): the error checks after each call, any condition guarding
 * the use of ff, the labels and the return statement are not visible in
 * this listing.
 */
414 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
415 struct lu_attr *la, struct filter_fid *ff)
417 struct ofd_thread_info *info = ofd_info(env);
418 struct ofd_device *ofd = ofd_obj2dev(fo);
420 struct ofd_mod_data *fmd;
425 ofd_write_lock(env, fo);
426 if (!ofd_object_exists(fo))
427 GOTO(unlock, rc = -ENOENT);
/* Record the newest xid that touched the timestamps; the stored value is
 * only ever moved forward. */
429 if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) {
430 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
431 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
432 fmd->fmd_mactime_xid = info->fti_xid;
433 ofd_fmd_put(info->fti_exp, fmd);
436 /* VBR: version recovery check */
437 rc = ofd_version_get_check(info, fo);
441 rc = ofd_attr_handle_ugid(env, fo, la, 1 /* is_setattr */);
446 rc = ofd_object_ff_load(env, fo);
453 th = ofd_trans_create(env, ofd);
455 GOTO(unlock, rc = PTR_ERR(th));
457 rc = dt_declare_attr_set(env, ofd_object_child(fo), la, th);
/* Point the per-thread buffer at ff; the same buffer is used for the
 * xattr declaration and for the xattr write below. */
462 info->fti_buf.lb_buf = ff;
463 info->fti_buf.lb_len = sizeof(*ff);
464 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
465 &info->fti_buf, XATTR_NAME_FID, 0,
/* The object is handed to ofd_trans_start() only when the size is being
 * changed. */
471 rc = ofd_trans_start(env, ofd, la->la_valid & LA_SIZE ? fo : NULL, th);
475 rc = dt_attr_set(env, ofd_object_child(fo), la, th,
476 ofd_object_capa(env, fo));
481 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
482 XATTR_NAME_FID, 0, th, BYPASS_CAPA);
/* Keep the in-memory parent FID in step with what was written; ff holds
 * little-endian values. */
484 fo->ofo_pfid.f_seq = le64_to_cpu(ff->ff_parent.f_seq);
485 fo->ofo_pfid.f_oid = le32_to_cpu(ff->ff_parent.f_oid);
486 /* Currently, the filter_fid::ff_parent::f_ver is not
487 * the real parent MDT-object's FID::f_ver, instead it
488 * is the OST-object index in its parent MDT-object's
490 fo->ofo_pfid.f_stripe_idx =
491 le32_to_cpu(ff->ff_parent.f_stripe_idx);
498 ofd_trans_stop(env, ofd, th, rc);
500 ofd_write_unlock(env, fo);
// Truncate object fo from offset start to the end of the object, apply
// attributes la and store the filter FID ff in the XATTR_NAME_FID
// extended attribute.
//
// Only truncation is supported: end is asserted to be OBD_OBJECT_EOF.
//
// Visible flow:
//  - raise fmd_mactime_xid of the per-export modification data to the
//    current request xid (done before the write lock is taken);
//  - under the object write lock, fail with -ENOENT when the object does
//    not exist;
//  - optionally verify the filter FID against oa, run the VBR version
//    check, the UID/GID handling (not as a setattr) and load the cached
//    parent FID;
//  - create a transaction, declare the attribute, punch and xattr
//    updates, start it, then punch, set the attributes and the xattr;
//  - refresh the in-memory parent FID fo->ofo_pfid from ff;
//  - stop the transaction and release the write lock.
//
// NOTE(review): the error checks after each call, any condition guarding
// the use of ff, the labels and the return statement are not visible in
// this listing.
505 int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
506 __u64 start, __u64 end, struct lu_attr *la,
507 struct filter_fid *ff, struct obdo *oa)
509 struct ofd_thread_info *info = ofd_info(env);
510 struct ofd_device *ofd = ofd_obj2dev(fo);
511 struct ofd_mod_data *fmd;
512 struct dt_object *dob = ofd_object_child(fo);
519 /* we support truncate, not punch yet */
520 LASSERT(end == OBD_OBJECT_EOF);
// Record the newest xid that touched the timestamps; the stored value is
// only ever moved forward.
522 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
523 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
524 fmd->fmd_mactime_xid = info->fti_xid;
525 ofd_fmd_put(info->fti_exp, fmd);
527 ofd_write_lock(env, fo);
528 if (!ofd_object_exists(fo))
529 GOTO(unlock, rc = -ENOENT);
// The filter FID is verified only when the device has
// ofd_lfsck_verify_pfid set and the request carries OBD_MD_FLFID.
531 if (ofd->ofd_lfsck_verify_pfid && oa->o_valid & OBD_MD_FLFID) {
532 rc = ofd_verify_ff(env, fo, oa);
537 /* VBR: version recovery check */
538 rc = ofd_version_get_check(info, fo);
542 rc = ofd_attr_handle_ugid(env, fo, la, 0 /* !is_setattr */);
547 rc = ofd_object_ff_load(env, fo);
554 th = ofd_trans_create(env, ofd);
556 GOTO(unlock, rc = PTR_ERR(th));
558 rc = dt_declare_attr_set(env, dob, la, th);
562 rc = dt_declare_punch(env, dob, start, OBD_OBJECT_EOF, th);
// Point the per-thread buffer at ff; the same buffer is used for the
// xattr declaration and for the xattr write below.
567 info->fti_buf.lb_buf = ff;
568 info->fti_buf.lb_len = sizeof(*ff);
569 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
570 &info->fti_buf, XATTR_NAME_FID, 0,
// Unlike ofd_attr_set(), the object is always handed to
// ofd_trans_start() here.
576 rc = ofd_trans_start(env, ofd, fo, th);
580 rc = dt_punch(env, dob, start, OBD_OBJECT_EOF, th,
581 ofd_object_capa(env, fo));
585 rc = dt_attr_set(env, dob, la, th, ofd_object_capa(env, fo));
590 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
591 XATTR_NAME_FID, 0, th, BYPASS_CAPA);
// Keep the in-memory parent FID in step with what was written; ff holds
// little-endian values.
593 fo->ofo_pfid.f_seq = le64_to_cpu(ff->ff_parent.f_seq);
594 fo->ofo_pfid.f_oid = le32_to_cpu(ff->ff_parent.f_oid);
595 /* Currently, the filter_fid::ff_parent::f_ver is not
596 * the real parent MDT-object's FID::f_ver, instead it
597 * is the OST-object index in its parent MDT-object's
599 fo->ofo_pfid.f_stripe_idx =
600 le32_to_cpu(ff->ff_parent.f_stripe_idx);
607 ofd_trans_stop(env, ofd, th, rc);
609 ofd_write_unlock(env, fo);
// Destroy object fo.
//
// Visible flow, under the object write lock:
//  - fail with -ENOENT when the object does not exist;
//  - create a transaction and declare the reference drop and the destroy;
//  - start the transaction;
//  - drop the per-export modification data kept for this object;
//  - drop the reference and destroy the underlying object;
//  - stop the transaction and release the write lock.
//
// NOTE(review): the rest of the parameter list, the condition selecting
// between the two transaction-start calls, the error checks, the labels
// and the return statement are not visible in this listing.
614 int ofd_object_destroy(const struct lu_env *env, struct ofd_object *fo,
617 struct ofd_device *ofd = ofd_obj2dev(fo);
623 ofd_write_lock(env, fo);
624 if (!ofd_object_exists(fo))
625 GOTO(unlock, rc = -ENOENT);
627 th = ofd_trans_create(env, ofd);
629 GOTO(unlock, rc = PTR_ERR(th));
// NOTE(review): the results of both declare calls are discarded.
631 dt_declare_ref_del(env, ofd_object_child(fo), th);
632 dt_declare_destroy(env, ofd_object_child(fo), th);
// Two ways of starting the transaction appear here: a local start on the
// OSD device, and ofd_trans_start() without an object.
634 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
636 rc = ofd_trans_start(env, ofd, NULL, th);
640 ofd_fmd_drop(ofd_info(env)->fti_exp, &fo->ofo_header.loh_fid);
// NOTE(review): the results of dt_ref_del() and dt_destroy() are
// discarded, so rc passed to ofd_trans_stop() does not reflect them.
642 dt_ref_del(env, ofd_object_child(fo), th);
643 dt_destroy(env, ofd_object_child(fo), th);
645 ofd_trans_stop(env, ofd, th, rc);
647 ofd_write_unlock(env, fo);
651 int ofd_attr_get(const struct lu_env *env, struct ofd_object *fo,
658 if (ofd_object_exists(fo)) {
659 rc = dt_attr_get(env, ofd_object_child(fo), la,
660 ofd_object_capa(env, fo));
662 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
663 /* Try to correct for a bug in 2.1.0 (LU-221) that caused
664 * negative timestamps to appear to be in the far future,
665 * due old timestamp being stored on disk as an unsigned value.
666 * This fixes up any bad values stored on disk before
667 * returning them to the client, and ensures any timestamp
668 * updates are correct. LU-1042 */
669 if (unlikely(la->la_atime == LU221_BAD_TIME))
671 if (unlikely(la->la_mtime == LU221_BAD_TIME))
673 if (unlikely(la->la_ctime == LU221_BAD_TIME))
676 #warning "remove old LU-221/LU-1042 workaround code"