4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Whamcloud, Inc.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/ofd/ofd_objects.c
38 * Author: Alex Zhuravlev <bzzz@whamcloud.com>
39 * Author: Mikhail Pershin <tappro@whamcloud.com>
42 #define DEBUG_SUBSYSTEM S_FILTER
44 #include <dt_object.h>
46 #include "ofd_internal.h"
/*
 * Version-based recovery (VBR) check for an OFD object.
 *
 * Reads the current version of the object with dt_version_get() and
 * compares it with the pre-operation version kept in
 * info->fti_pre_version.  A non-zero pre-version that differs from the
 * current one is logged and the export is flagged with exp_vbr_failed.
 *
 * info: per-thread OFD state; info->fti_exp must be set (asserted).
 * fo:   object to check; it must exist (asserted).
 *
 * NOTE(review): the return statements are not visible in this view.
 * Presumably the -EOPNOTSUPP test and the mismatch branch each return
 * an error and the normal path returns 0 - confirm against the full file.
 */
48 int ofd_version_get_check(struct ofd_thread_info *info,
49 struct ofd_object *fo)
51 dt_obj_version_t curr_version;
53 LASSERT(ofd_object_exists(fo));
54 LASSERT(info->fti_exp);
56 curr_version = dt_version_get(info->fti_env, ofd_object_child(fo));
/* The version is reinterpreted as signed so that an errno-style
 * -EOPNOTSUPP result from dt_version_get() can be recognised. */
57 if ((__s64)curr_version == -EOPNOTSUPP)
59 /* VBR: version is checked always because costs nothing */
60 if (info->fti_pre_version != 0 &&
61 info->fti_pre_version != curr_version) {
62 CDEBUG(D_INODE, "Version mismatch "LPX64" != "LPX64"\n",
63 info->fti_pre_version, curr_version);
/* The failure flag on the export is changed under exp_lock. */
64 spin_lock(&info->fti_exp->exp_lock);
65 info->fti_exp->exp_vbr_failed = 1;
66 spin_unlock(&info->fti_exp->exp_lock);
/* Remember the version that was just read. */
69 info->fti_pre_version = curr_version;
/*
 * Look up the object identified by fid on the OFD device.
 *
 * The lookup is done with lu_object_find() on the lu_device embedded in
 * ofd->ofd_dt_dev.  When lu_object_find() yields an error pointer, that
 * pointer is handed back to the caller cast to struct ofd_object *.
 *
 * NOTE(review): the declaration of o, the success branch and the final
 * return are not visible in this view; presumably the success branch
 * converts o to its ofd_object - confirm.
 */
73 struct ofd_object *ofd_object_find(const struct lu_env *env,
74 struct ofd_device *ofd,
75 const struct lu_fid *fid)
77 struct ofd_object *fo;
82 o = lu_object_find(env, &ofd->ofd_dt_dev.dd_lu_dev, fid, NULL);
83 if (likely(!IS_ERR(o)))
86 fo = (struct ofd_object *)o; /* return error */
/*
 * Find the object identified by fid, creating it if necessary.
 *
 * The object format is set to the one matching S_IFREG and the work is
 * delegated to dt_find_or_create() on ofd->ofd_osd, which also receives
 * attr.  The OFD layer of the resulting object is then located with
 * lu_object_locate() and returned as a struct ofd_object.
 *
 * NOTE(review): the last parameter (attr) and the condition guarding the
 * first RETURN are not visible in this view; that RETURN presumably
 * passes an error pointer from dt_find_or_create() through - confirm.
 */
90 struct ofd_object *ofd_object_find_or_create(const struct lu_env *env,
91 struct ofd_device *ofd,
92 const struct lu_fid *fid,
95 struct ofd_thread_info *info = ofd_info(env);
96 struct lu_object *fo_obj;
97 struct dt_object *dto;
101 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
103 dto = dt_find_or_create(env, ofd->ofd_osd, fid, &info->fti_dof, attr);
105 RETURN((struct ofd_object *)dto);
/* Pick the layer that belongs to the OFD device type out of the
 * compound object header. */
107 fo_obj = lu_object_locate(dto->do_lu.lo_header,
108 ofd->ofd_dt_dev.dd_lu_dev.ld_type);
109 RETURN(ofd_obj(fo_obj));
/*
 * Check for the XATTR_NAME_FID extended attribute on an object and cache
 * the answer in fo->ofo_ff_exists.
 *
 * Nothing is read when fo->ofo_ff_exists is already set.  Otherwise the
 * attribute is fetched with dt_xattr_get() into info->fti_mds_fid2.  A
 * non-negative result or -ENODATA both set fo->ofo_ff_exists to 1; the
 * existing comment in the body explains why -ENODATA is treated that way.
 *
 * NOTE(review): the declaration of rc and the return statements are not
 * visible in this view, so the exact return values are not documented
 * here - confirm against the full file.
 */
112 int ofd_object_ff_check(const struct lu_env *env, struct ofd_object *fo)
114 struct ofd_thread_info *info = ofd_info(env);
119 if (!fo->ofo_ff_exists) {
121 * This actually means that we don't know whether the object
122 * has the "fid" EA or not.
124 info->fti_buf.lb_buf = &info->fti_mds_fid2;
125 info->fti_buf.lb_len = sizeof(info->fti_mds_fid2);
126 rc = dt_xattr_get(env, ofd_object_child(fo), &info->fti_buf,
127 XATTR_NAME_FID, BYPASS_CAPA);
128 if (rc >= 0 || rc == -ENODATA) {
130 * Here we assume that, if the object doesn't have the
131 * "fid" EA, the caller will add one, unless a fatal
132 * error (e.g., a memory or disk failure) prevents it
135 fo->ofo_ff_exists = 1;
/*
 * Hand the lu_object embedded in fo to lu_object_put().
 *
 * NOTE(review): presumably this drops the reference obtained through
 * ofd_object_find() or ofd_object_find_or_create() - confirm.
 */
143 void ofd_object_put(const struct lu_env *env, struct ofd_object *fo)
145 lu_object_put(env, &fo->ofo_obj.do_lu);
/*
 * Precreate objects with ids id + i, for i in [0, nr), in sequence seq.
 *
 * Steps visible in this function, in order:
 *  - refuse ranges that would reach IDIF_MAX_OID (when
 *    fid_seq_is_mdt0(seq) holds) or OBIF_MAX_OID (otherwise) with -ENOSPC;
 *  - allocate the batch array of object pointers;
 *  - fill info->fti_attr with the creation template: regular file,
 *    SUID + SGID + 0666, and a/m/ctime set to 0;
 *  - prepare phase: build the FID for each id, look the object up with
 *    ofd_object_find() and take its write lock;
 *  - create a transaction, declare the last-id record write and the
 *    object creations, then start the transaction locally;
 *  - create each object that does not exist yet and advance the last id
 *    with ofd_last_id_set();
 *  - write the last id, converted to little endian, into
 *    ofd->ofd_lastid_obj[seq];
 *  - stop the transaction, unlock and put the objects, free batch.
 *
 * Returns objects when it is positive, otherwise rc.
 *
 * NOTE(review): several declarations, loop bodies and error guards are
 * not visible in this view (for example how batch is filled and how
 * objects is counted); the list above covers only what can be seen.
 */
148 int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
149 obd_id id, obd_seq seq, int nr)
151 struct ofd_thread_info *info = ofd_info(env);
152 struct ofd_object *fo = NULL;
153 struct dt_object *next;
155 struct ofd_object **batch;
164 /* Don't create objects beyond the valid range for this SEQ */
165 if (unlikely(fid_seq_is_mdt0(seq) && (id + nr) >= IDIF_MAX_OID)) {
166 CERROR("%s:"POSTID" hit the IDIF_MAX_OID (1<<48)!\n",
167 ofd_name(ofd), id, seq);
168 RETURN(rc = -ENOSPC);
169 } else if (unlikely(!fid_seq_is_mdt0(seq) &&
170 (id + nr) >= OBIF_MAX_OID)) {
171 CERROR("%s:"POSTID" hit the OBIF_MAX_OID (1<<32)!\n",
172 ofd_name(ofd), id, seq);
173 RETURN(rc = -ENOSPC);
/* The array is sized by nr_saved, the same count that is used for the
 * unlock/put loop and for OBD_FREE() at the end. */
176 OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
180 info->fti_attr.la_valid = LA_TYPE | LA_MODE;
182 * We mark object SUID+SGID to flag it for accepting UID+GID from
183 * client on first write. Currently the permission bits on the OST are
184 * never used, so this is OK.
186 info->fti_attr.la_mode = S_IFREG | S_ISUID | S_ISGID | 0666;
187 info->fti_dof.dof_type = dt_mode_to_dft(S_IFREG);
189 /* Initialize a/c/m time so any client timestamp will always
190 * be newer and update the inode. ctime = 0 is also handled
191 * specially in osd_inode_setattr(). See LU-221, LU-1042 */
192 info->fti_attr.la_valid |= LA_ATIME | LA_MTIME | LA_CTIME;
193 info->fti_attr.la_atime = 0;
194 info->fti_attr.la_mtime = 0;
195 info->fti_attr.la_ctime = 0;
197 /* prepare objects */
198 for (i = 0; i < nr; i++) {
199 info->fti_ostid.oi_id = id + i;
200 info->fti_ostid.oi_seq = seq;
202 rc = fid_ostid_unpack(&info->fti_fid, &info->fti_ostid, 0);
/* NOTE(review): rc was just set by fid_ostid_unpack(), yet the GOTO
 * below replaces it with PTR_ERR(fo), and fo is only looked up for this
 * iteration after this point (it starts out as NULL).  This looks as if
 * the unpack error is lost; the guarding condition is not visible here,
 * so confirm and consider GOTO(out, rc) instead. */
205 GOTO(out, rc = PTR_ERR(fo));
211 fo = ofd_object_find(env, ofd, &info->fti_fid);
214 GOTO(out, rc = PTR_ERR(fo));
220 ofd_write_lock(env, fo);
/* The buffer points at tmp, which receives the little-endian last id
 * further down, just before dt_record_write(). */
223 info->fti_buf.lb_buf = &tmp;
224 info->fti_buf.lb_len = sizeof(tmp);
227 th = ofd_trans_create(env, ofd);
229 GOTO(out, rc = PTR_ERR(th));
/* Declaration phase: announce the last-id update and every creation
 * before the transaction is started. */
231 rc = dt_declare_record_write(env, ofd->ofd_lastid_obj[seq],
232 sizeof(tmp), info->fti_off, th);
234 GOTO(trans_stop, rc);
236 for (i = 0; i < nr; i++) {
240 if (unlikely(ofd_object_exists(fo))) {
241 /* object may exist being re-created by write replay */
242 CDEBUG(D_INODE, "object "LPD64"/"LPD64" exists: "
243 DFID"\n", seq, id, PFID(&info->fti_fid));
247 next = ofd_object_child(fo);
248 LASSERT(next != NULL);
250 rc = dt_declare_create(env, next, &info->fti_attr, NULL,
258 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
260 GOTO(trans_stop, rc);
262 CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(&info->fti_fid));
/* Execution phase: create what is still missing and move the last id
 * forward for each processed object. */
264 for (i = 0; i < nr; i++) {
268 if (likely(!ofd_object_exists(fo))) {
269 next = ofd_object_child(fo);
270 LASSERT(next != NULL);
272 rc = dt_create(env, next, &info->fti_attr, NULL,
276 LASSERT(ofd_object_exists(fo));
278 ofd_last_id_set(ofd, id + i, seq);
283 tmp = cpu_to_le64(ofd_last_id(ofd, seq));
284 rc = dt_record_write(env, ofd->ofd_lastid_obj[seq],
285 &info->fti_buf, &info->fti_off, th);
288 ofd_trans_stop(env, ofd, th, rc);
/* Cleanup runs over nr_saved entries rather than nr. */
290 for (i = 0; i < nr_saved; i++) {
293 ofd_write_unlock(env, fo);
294 ofd_object_put(env, fo);
297 OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));
/* Creating nothing without an error code is reported at D_ERROR level
 * and is then rejected by the assertion below. */
299 CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
300 "created %d/%d objects: %d\n", objects, nr_saved, rc);
302 LASSERT(ergo(objects == 0, rc < 0));
303 RETURN(objects > 0 ? objects : rc);
307 * If the object still has SUID+SGID bits set (see ofd_precreate_objects()) then
308 * we will accept the UID+GID if sent by the client for initializing the
309 * ownership of this object. We only allow this to happen once (so clear these
310 * bits) and later only allow setattr.
/*
 * Decide whether UID/GID supplied in la may be applied to the object,
 * based on the SUID/SGID bits currently stored in its mode.
 *
 * env:        execution environment; the per-thread info supplies
 *             fti_attr2 as scratch space for the current attributes.
 * fo:         object whose current attributes are read with dt_attr_get().
 * la:         attributes requested by the caller; la_valid and la_mode
 *             may be modified here.
 * is_setattr: non-zero when called from the setattr path (ofd_attr_set()
 *             passes 1, ofd_object_punch() passes 0).
 *
 * NOTE(review): the early return for requests without UID/GID, the guard
 * around the LA_UID/LA_GID clearing and the statements that update mask
 * are not visible in this view.  Presumably mask collects S_ISUID and
 * S_ISGID for the IDs being accepted - confirm.
 */
312 int ofd_attr_handle_ugid(const struct lu_env *env, struct ofd_object *fo,
313 struct lu_attr *la, int is_setattr)
315 struct ofd_thread_info *info = ofd_info(env);
316 struct lu_attr *ln = &info->fti_attr2;
/* Only requests that carry a UID or a GID need the checks below. */
322 if (!(la->la_valid & LA_UID) && !(la->la_valid & LA_GID))
325 rc = dt_attr_get(env, ofd_object_child(fo), ln, BYPASS_CAPA);
329 LASSERT(ln->la_valid & LA_MODE);
/* An ID is removed from the request when the matching SUID/SGID bit is
 * no longer set in the stored mode. */
332 if (!(ln->la_mode & S_ISUID))
333 la->la_valid &= ~LA_UID;
334 if (!(ln->la_mode & S_ISGID))
335 la->la_valid &= ~LA_GID;
338 if ((la->la_valid & LA_UID) && (ln->la_mode & S_ISUID))
340 if ((la->la_valid & LA_GID) && (ln->la_mode & S_ISGID))
/* When the caller gave no mode, or this is not a setattr, start from the
 * stored mode so that clearing the mask bits keeps the other bits. */
343 if (!(la->la_valid & LA_MODE) || !is_setattr) {
344 la->la_mode = ln->la_mode;
345 la->la_valid |= LA_MODE;
347 la->la_mode &= ~mask;
/*
 * Set attributes la on object fo, and store ff under XATTR_NAME_FID.
 *
 * Visible sequence, all under the object write lock:
 *  - fail with -ENOENT when the object does not exist;
 *  - when any of atime/mtime/ctime is being set, raise the
 *    fmd_mactime_xid of the per-export modification data to
 *    info->fti_xid if it is lower;
 *  - run the VBR version check, the UID/GID handling (as setattr) and
 *    the XATTR_NAME_FID check;
 *  - create a transaction, declare the attribute and xattr updates,
 *    start the transaction, apply them, and stop the transaction.
 *
 * NOTE(review): the error checks after each call, the conditions that
 * guard the two xattr steps and the final return are not visible in
 * this view - confirm against the full file.
 */
353 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
354 struct lu_attr *la, struct filter_fid *ff)
356 struct ofd_thread_info *info = ofd_info(env);
357 struct ofd_device *ofd = ofd_obj2dev(fo);
359 struct ofd_mod_data *fmd;
364 ofd_write_lock(env, fo);
365 if (!ofd_object_exists(fo))
366 GOTO(unlock, rc = -ENOENT);
368 if (la->la_valid & (LA_ATIME | LA_MTIME | LA_CTIME)) {
369 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
/* fmd may be NULL; the xid is only ever moved forward. */
370 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
371 fmd->fmd_mactime_xid = info->fti_xid;
372 ofd_fmd_put(info->fti_exp, fmd);
375 /* VBR: version recovery check */
376 rc = ofd_version_get_check(info, fo);
380 rc = ofd_attr_handle_ugid(env, fo, la, 1 /* is_setattr */);
385 rc = ofd_object_ff_check(env, fo);
392 th = ofd_trans_create(env, ofd);
394 GOTO(unlock, rc = PTR_ERR(th));
396 rc = dt_declare_attr_set(env, ofd_object_child(fo), la, th);
/* The shared thread buffer is pointed at ff for the xattr declaration
 * and is reused by dt_xattr_set() below. */
401 info->fti_buf.lb_buf = ff;
402 info->fti_buf.lb_len = sizeof(*ff);
403 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
404 &info->fti_buf, XATTR_NAME_FID, 0,
/* The object is passed to ofd_trans_start() only when the size is part
 * of the update; otherwise NULL is passed. */
410 rc = ofd_trans_start(env, ofd, la->la_valid & LA_SIZE ? fo : NULL, th);
414 rc = dt_attr_set(env, ofd_object_child(fo), la, th,
415 ofd_object_capa(env, fo));
420 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
421 XATTR_NAME_FID, 0, th, BYPASS_CAPA);
424 ofd_trans_stop(env, ofd, th, rc);
426 ofd_write_unlock(env, fo);
/*
 * Truncate object fo at offset start and update its attributes.
 *
 * Only truncation is handled: end must be OBD_OBJECT_EOF (asserted), and
 * OBD_OBJECT_EOF is what is passed to dt_declare_punch() and dt_punch().
 *
 * Visible sequence:
 *  - before taking the lock, raise the fmd_mactime_xid of the per-export
 *    modification data to info->fti_xid if it is lower;
 *  - take the object write lock and fail with -ENOENT when the object
 *    does not exist;
 *  - run the VBR version check, the UID/GID handling (not as setattr)
 *    and the XATTR_NAME_FID check;
 *  - create a transaction, declare the attribute update, the punch and
 *    the xattr update, start the transaction, then punch, set the
 *    attributes and set the xattr, and stop the transaction.
 *
 * NOTE(review): the error checks after each call, the conditions that
 * guard the two xattr steps and the final return are not visible in
 * this view - confirm against the full file.
 */
430 int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
431 __u64 start, __u64 end, struct lu_attr *la,
432 struct filter_fid *ff)
434 struct ofd_thread_info *info = ofd_info(env);
435 struct ofd_device *ofd = ofd_obj2dev(fo);
436 struct ofd_mod_data *fmd;
437 struct dt_object *dob = ofd_object_child(fo);
444 /* we support truncate, not punch yet */
445 LASSERT(end == OBD_OBJECT_EOF);
447 fmd = ofd_fmd_get(info->fti_exp, &fo->ofo_header.loh_fid);
/* fmd may be NULL; the xid is only ever moved forward. */
448 if (fmd && fmd->fmd_mactime_xid < info->fti_xid)
449 fmd->fmd_mactime_xid = info->fti_xid;
450 ofd_fmd_put(info->fti_exp, fmd);
452 ofd_write_lock(env, fo);
453 if (!ofd_object_exists(fo))
454 GOTO(unlock, rc = -ENOENT);
456 /* VBR: version recovery check */
457 rc = ofd_version_get_check(info, fo);
461 rc = ofd_attr_handle_ugid(env, fo, la, 0 /* !is_setattr */);
466 rc = ofd_object_ff_check(env, fo);
473 th = ofd_trans_create(env, ofd);
475 GOTO(unlock, rc = PTR_ERR(th));
477 rc = dt_declare_attr_set(env, dob, la, th);
481 rc = dt_declare_punch(env, dob, start, OBD_OBJECT_EOF, th);
/* The shared thread buffer is pointed at ff for the xattr declaration
 * and is reused by dt_xattr_set() below. */
486 info->fti_buf.lb_buf = ff;
487 info->fti_buf.lb_len = sizeof(*ff);
488 rc = dt_declare_xattr_set(env, ofd_object_child(fo),
489 &info->fti_buf, XATTR_NAME_FID, 0,
/* Unlike ofd_attr_set(), the object is always passed here. */
495 rc = ofd_trans_start(env, ofd, fo, th);
499 rc = dt_punch(env, dob, start, OBD_OBJECT_EOF, th,
500 ofd_object_capa(env, fo));
504 rc = dt_attr_set(env, dob, la, th, ofd_object_capa(env, fo));
509 rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
510 XATTR_NAME_FID, 0, th, BYPASS_CAPA);
513 ofd_trans_stop(env, ofd, th, rc);
515 ofd_write_unlock(env, fo);
/*
 * Destroy object fo.
 *
 * Under the object write lock: fail with -ENOENT when the object does
 * not exist, create a transaction, declare the reference drop and the
 * destroy, start the transaction, drop the per-export modification data
 * for this FID, then perform the reference drop and the destroy and stop
 * the transaction.
 *
 * NOTE(review): the rest of the parameter list, the error checks and the
 * final return are not visible in this view.  Two different
 * transaction-start calls appear back to back below; presumably a
 * condition that is not visible selects one of them - confirm.
 */
519 int ofd_object_destroy(const struct lu_env *env, struct ofd_object *fo,
522 struct ofd_device *ofd = ofd_obj2dev(fo);
528 ofd_write_lock(env, fo);
529 if (!ofd_object_exists(fo))
530 GOTO(unlock, rc = -ENOENT);
532 th = ofd_trans_create(env, ofd);
534 GOTO(unlock, rc = PTR_ERR(th));
/* The results of the two declarations are not stored. */
536 dt_declare_ref_del(env, ofd_object_child(fo), th);
537 dt_declare_destroy(env, ofd_object_child(fo), th);
539 rc = dt_trans_start_local(env, ofd->ofd_osd, th);
541 rc = ofd_trans_start(env, ofd, NULL, th);
545 ofd_fmd_drop(ofd_info(env)->fti_exp, &fo->ofo_header.loh_fid);
547 dt_ref_del(env, ofd_object_child(fo), th);
548 dt_destroy(env, ofd_object_child(fo), th);
550 ofd_trans_stop(env, ofd, th, rc);
552 ofd_write_unlock(env, fo);
/*
 * Read the attributes of object fo into la.
 *
 * When the object exists, the attributes are fetched with dt_attr_get()
 * using the capability returned by ofd_object_capa().  For Lustre
 * versions below 2.7.50.0 each of atime, mtime and ctime is compared
 * with LU221_BAD_TIME as part of the LU-221/LU-1042 workaround; a build
 * warning asks for that workaround to be removed.
 *
 * NOTE(review): the rest of the parameter list, the action taken for a
 * bad timestamp, the branch for a missing object and the return
 * statement are not visible in this view - confirm against the full file.
 */
556 int ofd_attr_get(const struct lu_env *env, struct ofd_object *fo,
563 if (ofd_object_exists(fo)) {
564 rc = dt_attr_get(env, ofd_object_child(fo), la,
565 ofd_object_capa(env, fo));
567 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
568 /* Try to correct for a bug in 2.1.0 (LU-221) that caused
569 * negative timestamps to appear to be in the far future,
570 * due to the old timestamp being stored on disk as an unsigned value.
571 * This fixes up any bad values stored on disk before
572 * returning them to the client, and ensures any timestamp
573 * updates are correct. LU-1042 */
574 if (unlikely(la->la_atime == LU221_BAD_TIME))
576 if (unlikely(la->la_mtime == LU221_BAD_TIME))
578 if (unlikely(la->la_ctime == LU221_BAD_TIME))
581 #warning "remove old LU-221/LU-1042 workaround code"