Whamcloud - gitweb
LU-3811 hsm: handle file ownership and timestamps
[fs/lustre-release.git] / lustre / mdt / mdt_open.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2013, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/mdt/mdt_open.c
37  *
38  * Lustre Metadata Target (mdt) open/close file handling
39  *
40  * Author: Huang Hua <huanghua@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_MDS
44
45 #include <lustre_acl.h>
46 #include <lustre_mds.h>
47 #include "mdt_internal.h"
48
/*
 * Addref callback for mfd handles.
 * mfd objects have no reference count at present, so nothing is done here.
 */
static void mdt_mfd_get(void *mfdp)
{
        /* intentionally empty */
}
53
54 static struct portals_handle_ops mfd_handle_ops = {
55         .hop_addref = mdt_mfd_get,
56         .hop_free   = NULL,
57 };
58
59 /* Create a new mdt_file_data struct, initialize it,
60  * and insert it to global hash table */
61 struct mdt_file_data *mdt_mfd_new(const struct mdt_export_data *med)
62 {
63         struct mdt_file_data *mfd;
64         ENTRY;
65
66         OBD_ALLOC_PTR(mfd);
67         if (mfd != NULL) {
68                 CFS_INIT_LIST_HEAD(&mfd->mfd_handle.h_link);
69                 mfd->mfd_handle.h_owner = med;
70                 CFS_INIT_LIST_HEAD(&mfd->mfd_list);
71                 class_handle_hash(&mfd->mfd_handle, &mfd_handle_ops);
72         }
73
74         RETURN(mfd);
75 }
76
77 /*
78  * Find the mfd pointed to by handle in global hash table.
79  * In case of replay the handle is obsoleted
80  * but mfd can be found in mfd list by that handle
81  */
82 struct mdt_file_data *mdt_handle2mfd(struct mdt_export_data *med,
83                                      const struct lustre_handle *handle,
84                                      bool is_replay_or_resent)
85 {
86         struct mdt_file_data   *mfd;
87         ENTRY;
88
89         LASSERT(handle != NULL);
90         mfd = class_handle2object(handle->cookie, med);
91         /* during dw/setattr replay the mfd can be found by old handle */
92         if (mfd == NULL && is_replay_or_resent) {
93                 cfs_list_for_each_entry(mfd, &med->med_open_head, mfd_list) {
94                         if (mfd->mfd_old_handle.cookie == handle->cookie)
95                                 RETURN(mfd);
96                 }
97                 mfd = NULL;
98         }
99
100         RETURN(mfd);
101 }
102
103 /* free mfd */
104 void mdt_mfd_free(struct mdt_file_data *mfd)
105 {
106         LASSERT(cfs_list_empty(&mfd->mfd_list));
107         OBD_FREE_RCU(mfd, sizeof *mfd, &mfd->mfd_handle);
108 }
109
110 static int mdt_create_data(struct mdt_thread_info *info,
111                            struct mdt_object *p, struct mdt_object *o)
112 {
113         struct md_op_spec     *spec = &info->mti_spec;
114         struct md_attr        *ma   = &info->mti_attr;
115         int                    rc   = 0;
116         ENTRY;
117
118         if (!md_should_create(spec->sp_cr_flags))
119                 RETURN(0);
120
121         ma->ma_need = MA_INODE | MA_LOV;
122         ma->ma_valid = 0;
123         mutex_lock(&o->mot_lov_mutex);
124         if (!(o->mot_flags & MOF_LOV_CREATED)) {
125                 if (p != NULL && (fid_is_obf(mdt_object_fid(p)) ||
126                                   fid_is_dot_lustre(mdt_object_fid(p))))
127                         GOTO(unlock, rc = -EPERM);
128
129                 rc = mdo_create_data(info->mti_env,
130                                      p ? mdt_object_child(p) : NULL,
131                                      mdt_object_child(o), spec, ma);
132                 if (rc == 0)
133                         rc = mdt_attr_get_complex(info, o, ma);
134
135                 if (rc == 0 && ma->ma_valid & MA_LOV)
136                         o->mot_flags |= MOF_LOV_CREATED;
137         }
138 unlock:
139         mutex_unlock(&o->mot_lov_mutex);
140         RETURN(rc);
141 }
142
143 static int mdt_ioepoch_opened(struct mdt_object *mo)
144 {
145         return mo->mot_ioepoch_count;
146 }
147
148 int mdt_object_is_som_enabled(struct mdt_object *mo)
149 {
150         return !mo->mot_ioepoch;
151 }
152
153 /**
154  * Re-enable Size-on-MDS.
155  * Call under ->mot_ioepoch_mutex.
156  */
157 static void mdt_object_som_enable(struct mdt_object *mo, __u64 ioepoch)
158 {
159         if (ioepoch == mo->mot_ioepoch) {
160                 LASSERT(!mdt_ioepoch_opened(mo));
161                 mo->mot_ioepoch = 0;
162                 mo->mot_flags = 0;
163         }
164 }
165
166 /**
167  * Open the IOEpoch. It is allowed if @writecount is not negative.
168  * The epoch and writecount handling is performed under the mot_ioepoch_mutex.
169  */
170 int mdt_ioepoch_open(struct mdt_thread_info *info, struct mdt_object *o,
171                      int created)
172 {
173         struct mdt_device *mdt = info->mti_mdt;
174         int cancel = 0;
175         int rc = 0;
176         ENTRY;
177
178         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
179             !S_ISREG(lu_object_attr(&o->mot_obj)))
180                 RETURN(0);
181
182         mutex_lock(&o->mot_ioepoch_mutex);
183         if (mdt_ioepoch_opened(o)) {
184                 /* Epoch continues even if there is no writers yet. */
185                 CDEBUG(D_INODE, "continue epoch "LPU64" for "DFID"\n",
186                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
187         } else {
188                 /* XXX: ->mdt_ioepoch is not initialized at the mount */
189                 spin_lock(&mdt->mdt_ioepoch_lock);
190                 if (mdt->mdt_ioepoch < info->mti_replayepoch)
191                         mdt->mdt_ioepoch = info->mti_replayepoch;
192
193                 if (info->mti_replayepoch)
194                         o->mot_ioepoch = info->mti_replayepoch;
195                 else if (++mdt->mdt_ioepoch == IOEPOCH_INVAL)
196                         o->mot_ioepoch = ++mdt->mdt_ioepoch;
197                 else
198                         o->mot_ioepoch = mdt->mdt_ioepoch;
199
200                 spin_unlock(&mdt->mdt_ioepoch_lock);
201
202                 CDEBUG(D_INODE, "starting epoch "LPU64" for "DFID"\n",
203                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
204                 if (created)
205                         o->mot_flags |= MOF_SOM_CREATED;
206                 cancel = 1;
207         }
208         o->mot_ioepoch_count++;
209         mutex_unlock(&o->mot_ioepoch_mutex);
210
211         /* Cancel Size-on-MDS attributes cached on clients for the open case.
212          * In the truncate case, see mdt_reint_setattr(). */
213         if (cancel && (info->mti_rr.rr_fid1 != NULL)) {
214                 struct mdt_lock_handle  *lh = &info->mti_lh[MDT_LH_CHILD];
215                 mdt_lock_reg_init(lh, LCK_EX);
216                 rc = mdt_object_lock(info, o, lh, MDS_INODELOCK_UPDATE,
217                                      MDT_LOCAL_LOCK);
218                 if (rc == 0)
219                         mdt_object_unlock(info, o, lh, 1);
220         }
221         RETURN(rc);
222 }
223
224 /**
225  * Update SOM on-disk attributes.
226  * If enabling, write update inodes and lustre-ea with the proper IOEpoch,
227  * mountid and attributes. If disabling, clean SOM xattr.
228  * Call under ->mot_ioepoch_mutex.
229  */
230 static int mdt_som_attr_set(struct mdt_thread_info *info,
231                             struct mdt_object *obj, __u64 ioepoch, bool enable)
232 {
233         struct md_object        *next = mdt_object_child(obj);
234         int                      rc;
235         ENTRY;
236
237         CDEBUG(D_INODE, "Size-on-MDS attribute %s for epoch "LPU64
238                " on "DFID".\n", enable ? "update" : "disabling",
239                ioepoch, PFID(mdt_object_fid(obj)));
240
241         if (enable) {
242                 struct lu_buf           *buf = &info->mti_buf;
243                 struct som_attrs        *attrs;
244                 struct md_attr          *ma = &info->mti_attr;
245                 struct lu_attr          *la = &ma->ma_attr;
246                 struct obd_device       *obd = info->mti_mdt->mdt_lut.lut_obd;
247
248                 attrs = (struct som_attrs *)info->mti_xattr_buf;
249                 CLASSERT(sizeof(info->mti_xattr_buf) >= sizeof(*attrs));
250
251                 /* pack SOM attributes */
252                 memset(attrs, 0, sizeof(*attrs));
253                 attrs->som_ioepoch = ioepoch;
254                 attrs->som_mountid = obd->u.obt.obt_mount_count;
255                 if ((la->la_valid & LA_SIZE) != 0)
256                         attrs->som_size = la->la_size;
257                 if ((la->la_valid & LA_BLOCKS) != 0)
258                         attrs->som_blocks = la->la_blocks;
259                 lustre_som_swab(attrs);
260
261                 /* update SOM attributes */
262                 buf->lb_buf = attrs;
263                 buf->lb_len = sizeof(*attrs);
264                 rc = mo_xattr_set(info->mti_env, next, buf, XATTR_NAME_SOM, 0);
265         } else {
266                 /* delete SOM attributes */
267                 rc = mo_xattr_del(info->mti_env, next, XATTR_NAME_SOM);
268         }
269
270         RETURN(rc);
271 }
272
273 /** Perform the eviction specific actions on ioepoch close. */
274 static inline int mdt_ioepoch_close_on_eviction(struct mdt_thread_info *info,
275                                                 struct mdt_object *o)
276 {
277         int rc = 0;
278
279         mutex_lock(&o->mot_ioepoch_mutex);
280         CDEBUG(D_INODE, "Eviction. Closing IOepoch "LPU64" on "DFID". "
281                "Count %d\n", o->mot_ioepoch, PFID(mdt_object_fid(o)),
282                o->mot_ioepoch_count);
283         o->mot_ioepoch_count--;
284
285         /* If eviction occured set MOF_SOM_RECOV,
286          * if no other epoch holders, disable SOM on disk. */
287         o->mot_flags |= MOF_SOM_CHANGE | MOF_SOM_RECOV;
288         if (!mdt_ioepoch_opened(o)) {
289                 rc = mdt_som_attr_set(info, o, o->mot_ioepoch, MDT_SOM_DISABLE);
290                 mdt_object_som_enable(o, o->mot_ioepoch);
291         }
292         mutex_unlock(&o->mot_ioepoch_mutex);
293         RETURN(rc);
294 }
295
296 /**
297  * Perform the replay specific actions on ioepoch close.
298  * Skip SOM attribute update if obtained and just forget about the inode state
299  * for the last ioepoch holder. The SOM cache is invalidated on MDS failure.
300  */
301 static inline int mdt_ioepoch_close_on_replay(struct mdt_thread_info *info,
302                                               struct mdt_object *o)
303 {
304         int rc = MDT_IOEPOCH_CLOSED;
305         ENTRY;
306
307         mutex_lock(&o->mot_ioepoch_mutex);
308         CDEBUG(D_INODE, "Replay. Closing epoch "LPU64" on "DFID". Count %d\n",
309                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
310         o->mot_ioepoch_count--;
311
312         /* Get an info from the replayed request if client is supposed
313          * to send an Attibute Update, reconstruct @rc if so */
314         if (info->mti_ioepoch->flags & MF_SOM_AU)
315                 rc = MDT_IOEPOCH_GETATTR;
316
317         if (!mdt_ioepoch_opened(o))
318                 mdt_object_som_enable(o, info->mti_ioepoch->ioepoch);
319         mutex_unlock(&o->mot_ioepoch_mutex);
320
321         RETURN(rc);
322 }
323
324 /**
325  * Regular file IOepoch close.
326  * Closes the ioepoch, checks the object state, apply obtained attributes and
327  * re-enable SOM on the object, if possible. Also checks if the recovery is
328  * needed and packs OBD_MD_FLGETATTRLOCK flag into the reply to force the client
329  * to obtain SOM attributes under the server-side OST locks.
330  *
331  * Return value:
332  * MDT_IOEPOCH_CLOSED if ioepoch is closed.
333  * MDT_IOEPOCH_GETATTR if ioepoch is closed but another SOM update is needed.
334  */
335 static inline int mdt_ioepoch_close_reg(struct mdt_thread_info *info,
336                                         struct mdt_object *o)
337 {
338         struct md_attr *tmp_ma;
339         struct lu_attr *la;
340         int achange, opened;
341         int recovery = 0;
342         int rc = 0, ret = MDT_IOEPOCH_CLOSED;
343         ENTRY;
344
345         la = &info->mti_attr.ma_attr;
346         achange = (info->mti_ioepoch->flags & MF_SOM_CHANGE);
347
348         mutex_lock(&o->mot_ioepoch_mutex);
349         o->mot_ioepoch_count--;
350
351         tmp_ma = &info->mti_u.som.attr;
352         tmp_ma->ma_lmm = info->mti_attr.ma_lmm;
353         tmp_ma->ma_lmm_size = info->mti_attr.ma_lmm_size;
354         tmp_ma->ma_som = &info->mti_u.som.data;
355         tmp_ma->ma_need = MA_INODE | MA_LOV | MA_SOM;
356         tmp_ma->ma_valid = 0;
357         rc = mdt_attr_get_complex(info, o, tmp_ma);
358         if (rc)
359                 GOTO(error_up, rc);
360
361         /* Check the on-disk SOM state. */
362         if (o->mot_flags & MOF_SOM_RECOV)
363                 recovery = 1;
364         else if (!(o->mot_flags & MOF_SOM_CREATED) &&
365                  !(tmp_ma->ma_valid & MA_SOM))
366                 recovery = 1;
367
368         CDEBUG(D_INODE, "Closing epoch "LPU64" on "DFID". Count %d\n",
369                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
370
371         opened = mdt_ioepoch_opened(o);
372         /**
373          * If IOEpoch is not opened, check if a Size-on-MDS update is needed.
374          * Skip the check for file with no LOV  or for unlink files.
375          */
376         if (!opened && tmp_ma->ma_valid & MA_LOV &&
377             !(tmp_ma->ma_valid & MA_INODE && tmp_ma->ma_attr.la_nlink == 0)) {
378                 if (recovery)
379                         /* If some previous writer was evicted, re-ask the
380                          * client for attributes. Even if attributes are
381                          * provided, we cannot believe in them.
382                          * Another use case is that there is no SOM cache on
383                          * disk -- first access with SOM or there was an MDS
384                          * failure. */
385                         ret = MDT_IOEPOCH_GETATTR;
386                 else if (o->mot_flags & MOF_SOM_CHANGE)
387                         /* Some previous writer changed the attribute.
388                          * Do not believe to the current Size-on-MDS
389                          * update, re-ask client. */
390                         ret = MDT_IOEPOCH_GETATTR;
391                 else if (!(la->la_valid & LA_SIZE) && achange)
392                         /* Attributes were changed by the last writer
393                          * only but no Size-on-MDS update is received.*/
394                         ret = MDT_IOEPOCH_GETATTR;
395         }
396
397         if (achange || ret == MDT_IOEPOCH_GETATTR)
398                 o->mot_flags |= MOF_SOM_CHANGE;
399
400         /* If epoch ends and relable SOM attributes are obtained, update them.
401          * Create SOM ea for new files even if there is no attributes obtained
402          * (0-length file). */
403         if (ret == MDT_IOEPOCH_CLOSED && !opened) {
404                 if (achange || o->mot_flags & MOF_SOM_CREATED) {
405                         LASSERT(achange || !(la->la_valid & LA_SIZE));
406                         rc = mdt_som_attr_set(info, o, o->mot_ioepoch,
407                                               MDT_SOM_ENABLE);
408                         /* Avoid the following setattrs of these attributes,
409                          * e.g. for atime update. */
410                         info->mti_attr.ma_valid = 0;
411                 }
412                 mdt_object_som_enable(o, o->mot_ioepoch);
413         }
414
415         mutex_unlock(&o->mot_ioepoch_mutex);
416         /* If recovery is needed, tell the client to perform GETATTR under
417          * the lock. */
418         if (ret == MDT_IOEPOCH_GETATTR && recovery) {
419                 struct mdt_body *rep;
420                 rep = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
421                 rep->valid |= OBD_MD_FLGETATTRLOCK;
422         }
423
424         RETURN(rc ? : ret);
425
426 error_up:
427         mutex_unlock(&o->mot_ioepoch_mutex);
428         return rc;
429 }
430
431 /**
432  * Close IOEpoch (opened file or MDS_FMODE_EPOCH state). It happens if:
433  * - a client closes the IOEpoch;
434  * - a client eviction occured.
435  * Return values:
436  * MDT_IOEPOCH_OPENED if the client does not close IOEpoch.
437  * MDT_IOEPOCH_CLOSED if the client closes IOEpoch.
438  * MDT_IOEPOCH_GETATTR if the client closes IOEpoch but another SOM attribute
439  * update is needed.
440  */
441 static int mdt_ioepoch_close(struct mdt_thread_info *info, struct mdt_object *o)
442 {
443         struct ptlrpc_request *req = mdt_info_req(info);
444         ENTRY;
445
446         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
447             !S_ISREG(lu_object_attr(&o->mot_obj)))
448                 RETURN(0);
449
450         LASSERT(o->mot_ioepoch_count);
451         LASSERT(info->mti_ioepoch == NULL ||
452                 info->mti_ioepoch->ioepoch == o->mot_ioepoch);
453
454         /* IOEpoch is closed only if client tells about it or eviction occures.
455          * In the replay case, always close the epoch. */
456         if (req == NULL)
457                 RETURN(mdt_ioepoch_close_on_eviction(info, o));
458         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
459                 RETURN(mdt_ioepoch_close_on_replay(info, o));
460         if (info->mti_ioepoch->flags & MF_EPOCH_CLOSE)
461                 RETURN(mdt_ioepoch_close_reg(info, o));
462         /* IO epoch is not closed. */
463         RETURN(MDT_IOEPOCH_OPENED);
464 }
465
466 /**
467  * Close MDS_FMODE_SOM state, when IOEpoch is already closed and we are waiting
468  * for attribute update. It happens if:
469  * - SOM Attribute Update is obtained;
470  * - the client failed to obtain it and informs MDS about it;
471  * - a client eviction occured.
472  * Apply obtained attributes for the 1st case, wipe out the on-disk SOM
473  * cache otherwise.
474  */
475 int mdt_som_au_close(struct mdt_thread_info *info, struct mdt_object *o)
476 {
477         struct ptlrpc_request   *req = mdt_info_req(info);
478         __u64                    ioepoch = 0;
479         int                      act = MDT_SOM_ENABLE;
480         int                      rc = 0;
481         ENTRY;
482
483         LASSERT(!req || info->mti_ioepoch);
484         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
485             !S_ISREG(lu_object_attr(&o->mot_obj)))
486                 RETURN(0);
487
488         /* No size whereas MF_SOM_CHANGE is set means client failed to
489          * obtain ost attributes, drop the SOM cache on disk if so. */
490         if (!req ||
491             (info->mti_ioepoch &&
492              info->mti_ioepoch->flags & MF_SOM_CHANGE &&
493              !(info->mti_attr.ma_attr.la_valid & LA_SIZE)))
494                 act = MDT_SOM_DISABLE;
495
496         mutex_lock(&o->mot_ioepoch_mutex);
497         /* Mark the object it is the recovery state if we failed to obtain
498          * SOM attributes. */
499         if (act == MDT_SOM_DISABLE)
500                 o->mot_flags |= MOF_SOM_RECOV;
501
502         if (!mdt_ioepoch_opened(o)) {
503                 ioepoch =  info->mti_ioepoch ?
504                         info->mti_ioepoch->ioepoch : o->mot_ioepoch;
505
506                 if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))
507                         rc = mdt_som_attr_set(info, o, ioepoch, act);
508                 mdt_object_som_enable(o, ioepoch);
509         }
510         mutex_unlock(&o->mot_ioepoch_mutex);
511         RETURN(rc);
512 }
513
514 int mdt_write_read(struct mdt_object *o)
515 {
516         int rc = 0;
517         ENTRY;
518         mutex_lock(&o->mot_ioepoch_mutex);
519         rc = o->mot_writecount;
520         mutex_unlock(&o->mot_ioepoch_mutex);
521         RETURN(rc);
522 }
523
524 int mdt_write_get(struct mdt_object *o)
525 {
526         int rc = 0;
527         ENTRY;
528         mutex_lock(&o->mot_ioepoch_mutex);
529         if (o->mot_writecount < 0)
530                 rc = -ETXTBSY;
531         else
532                 o->mot_writecount++;
533         mutex_unlock(&o->mot_ioepoch_mutex);
534         RETURN(rc);
535 }
536
537 void mdt_write_put(struct mdt_object *o)
538 {
539         ENTRY;
540         mutex_lock(&o->mot_ioepoch_mutex);
541         o->mot_writecount--;
542         mutex_unlock(&o->mot_ioepoch_mutex);
543         EXIT;
544 }
545
546 static int mdt_write_deny(struct mdt_object *o)
547 {
548         int rc = 0;
549         ENTRY;
550         mutex_lock(&o->mot_ioepoch_mutex);
551         if (o->mot_writecount > 0)
552                 rc = -ETXTBSY;
553         else
554                 o->mot_writecount--;
555         mutex_unlock(&o->mot_ioepoch_mutex);
556         RETURN(rc);
557 }
558
559 static void mdt_write_allow(struct mdt_object *o)
560 {
561         ENTRY;
562         mutex_lock(&o->mot_ioepoch_mutex);
563         o->mot_writecount++;
564         mutex_unlock(&o->mot_ioepoch_mutex);
565         EXIT;
566 }
567
568 /* there can be no real transaction so prepare the fake one */
569 static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
570 {
571         struct mdt_device      *mdt = info->mti_mdt;
572         struct ptlrpc_request  *req = mdt_info_req(info);
573         struct tg_export_data  *ted;
574         struct lsd_client_data *lcd;
575
576         ENTRY;
577         /* transaction has occurred already */
578         if (lustre_msg_get_transno(req->rq_repmsg) != 0)
579                 RETURN_EXIT;
580
581         spin_lock(&mdt->mdt_lut.lut_translock);
582         if (rc != 0) {
583                 if (info->mti_transno != 0) {
584                         struct obd_export *exp = req->rq_export;
585
586                         CERROR("%s: replay trans "LPU64" NID %s: rc = %d\n",
587                                mdt_obd_name(mdt), info->mti_transno,
588                                libcfs_nid2str(exp->exp_connection->c_peer.nid),
589                                rc);
590                         spin_unlock(&mdt->mdt_lut.lut_translock);
591                         RETURN_EXIT;
592                 }
593         } else if (info->mti_transno == 0) {
594                 info->mti_transno = ++mdt->mdt_lut.lut_last_transno;
595         } else {
596                 /* should be replay */
597                 if (info->mti_transno > mdt->mdt_lut.lut_last_transno)
598                         mdt->mdt_lut.lut_last_transno = info->mti_transno;
599         }
600         spin_unlock(&mdt->mdt_lut.lut_translock);
601
602         CDEBUG(D_INODE, "transno = "LPU64", last_committed = "LPU64"\n",
603                info->mti_transno,
604                req->rq_export->exp_obd->obd_last_committed);
605
606         req->rq_transno = info->mti_transno;
607         lustre_msg_set_transno(req->rq_repmsg, info->mti_transno);
608
609         /* update lcd in memory only for resent cases */
610         ted = &req->rq_export->exp_target_data;
611         LASSERT(ted);
612         mutex_lock(&ted->ted_lcd_lock);
613         lcd = ted->ted_lcd;
614         if (info->mti_transno < lcd->lcd_last_transno &&
615             info->mti_transno != 0) {
616                 /* This should happen during replay. Do not update
617                  * last rcvd info if replay req transno < last transno,
618                  * otherwise the following resend(after replay) can not
619                  * be checked correctly by xid */
620                 mutex_unlock(&ted->ted_lcd_lock);
621                 CDEBUG(D_HA, "%s: transno = "LPU64" < last_transno = "LPU64"\n",
622                        mdt_obd_name(mdt), info->mti_transno,
623                        lcd->lcd_last_transno);
624                 RETURN_EXIT;
625         }
626
627         if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE ||
628             lustre_msg_get_opc(req->rq_reqmsg) == MDS_DONE_WRITING) {
629                 if (info->mti_transno != 0)
630                         lcd->lcd_last_close_transno = info->mti_transno;
631                 lcd->lcd_last_close_xid = req->rq_xid;
632                 lcd->lcd_last_close_result = rc;
633         } else {
634                 /* VBR: save versions in last_rcvd for reconstruct. */
635                 __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg);
636                 if (pre_versions) {
637                         lcd->lcd_pre_versions[0] = pre_versions[0];
638                         lcd->lcd_pre_versions[1] = pre_versions[1];
639                         lcd->lcd_pre_versions[2] = pre_versions[2];
640                         lcd->lcd_pre_versions[3] = pre_versions[3];
641                 }
642                 if (info->mti_transno != 0)
643                         lcd->lcd_last_transno = info->mti_transno;
644
645                 lcd->lcd_last_xid = req->rq_xid;
646                 lcd->lcd_last_result = rc;
647                 lcd->lcd_last_data = info->mti_opdata;
648         }
649         mutex_unlock(&ted->ted_lcd_lock);
650
651         EXIT;
652 }
653
654 void mdt_mfd_set_mode(struct mdt_file_data *mfd, __u64 mode)
655 {
656         LASSERT(mfd != NULL);
657
658         CDEBUG(D_HA, DFID " Change mfd mode "LPO64" -> "LPO64".\n",
659                PFID(mdt_object_fid(mfd->mfd_object)), mfd->mfd_mode, mode);
660
661         mfd->mfd_mode = mode;
662 }
663
664 static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
665                         struct mdt_object *o, __u64 flags, int created)
666 {
667         struct ptlrpc_request   *req = mdt_info_req(info);
668         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
669         struct mdt_file_data    *mfd;
670         struct md_attr          *ma  = &info->mti_attr;
671         struct lu_attr          *la  = &ma->ma_attr;
672         struct mdt_body         *repbody;
673         int                      rc = 0, isdir, isreg;
674         ENTRY;
675
676         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
677
678         isreg = S_ISREG(la->la_mode);
679         isdir = S_ISDIR(la->la_mode);
680         if (isreg && !(ma->ma_valid & MA_LOV) && !(flags & MDS_OPEN_RELEASE)) {
681                 /*
682                  * No EA, check whether it is will set regEA and dirEA since in
683                  * above attr get, these size might be zero, so reset it, to
684                  * retrieve the MD after create obj.
685                  */
686                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
687                                                        &RMF_MDT_MD,
688                                                        RCL_SERVER);
689                 /* in replay case, p == NULL */
690                 rc = mdt_create_data(info, p, o);
691                 if (rc)
692                         RETURN(rc);
693         }
694
695         CDEBUG(D_INODE, "after open, ma_valid bit = "LPX64" lmm_size = %d\n",
696                ma->ma_valid, ma->ma_lmm_size);
697
698         if (ma->ma_valid & MA_LOV) {
699                 LASSERT(ma->ma_lmm_size != 0);
700                 repbody->eadatasize = ma->ma_lmm_size;
701                 if (isdir)
702                         repbody->valid |= OBD_MD_FLDIREA;
703                 else
704                         repbody->valid |= OBD_MD_FLEASIZE;
705         }
706
707         if (flags & FMODE_WRITE) {
708                 rc = mdt_write_get(o);
709                 if (rc == 0) {
710                         mdt_ioepoch_open(info, o, created);
711                         repbody->ioepoch = o->mot_ioepoch;
712                 }
713         } else if (flags & MDS_FMODE_EXEC) {
714                 /* if file is released, we can't deny write because we must
715                  * restore (write) it to access it.*/
716                 if ((ma->ma_valid & MA_HSM) &&
717                     (ma->ma_hsm.mh_flags & HS_RELEASED))
718                         rc = 0;
719                 else
720                         rc = mdt_write_deny(o);
721         }
722         if (rc)
723                 RETURN(rc);
724
725         rc = mo_open(info->mti_env, mdt_object_child(o),
726                      created ? flags | MDS_OPEN_CREATED : flags);
727         if (rc)
728                 GOTO(err_out, rc);
729
730         mfd = mdt_mfd_new(med);
731         if (mfd == NULL)
732                 GOTO(err_out, rc = -ENOMEM);
733
734         /*
735          * Keep a reference on this object for this open, and is
736          * released by mdt_mfd_close().
737          */
738         mdt_object_get(info->mti_env, o);
739         mfd->mfd_object = o;
740         mfd->mfd_xid = req->rq_xid;
741
742         /*
743          * @flags is always not zero. At least it should be FMODE_READ,
744          * FMODE_WRITE or MDS_FMODE_EXEC.
745          */
746         LASSERT(flags != 0);
747
748         /* Open handling. */
749         mdt_mfd_set_mode(mfd, flags);
750
751         atomic_inc(&o->mot_open_count);
752         if (flags & MDS_OPEN_LEASE)
753                 atomic_inc(&o->mot_lease_count);
754
755         /* replay handle */
756         if (req_is_replay(req)) {
757                 struct mdt_file_data *old_mfd;
758                 /* Check wheather old cookie already exist in
759                  * the list, becasue when do recovery, client
760                  * might be disconnected from server, and
761                  * restart replay, so there maybe some orphan
762                  * mfd here, we should remove them */
763                 LASSERT(info->mti_rr.rr_handle != NULL);
764                 old_mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle, true);
765                 if (old_mfd != NULL) {
766                         CDEBUG(D_HA, "delete orphan mfd = %p, fid = "DFID", "
767                                "cookie = "LPX64"\n", mfd,
768                                PFID(mdt_object_fid(mfd->mfd_object)),
769                                info->mti_rr.rr_handle->cookie);
770                         spin_lock(&med->med_open_lock);
771                         class_handle_unhash(&old_mfd->mfd_handle);
772                         cfs_list_del_init(&old_mfd->mfd_list);
773                         spin_unlock(&med->med_open_lock);
774                         /* no attr update for that close */
775                         la->la_valid = 0;
776                         ma->ma_valid |= MA_FLAGS;
777                         ma->ma_attr_flags |= MDS_RECOV_OPEN;
778                         mdt_mfd_close(info, old_mfd);
779                         ma->ma_attr_flags &= ~MDS_RECOV_OPEN;
780                         ma->ma_valid &= ~MA_FLAGS;
781                 }
782
783                 CDEBUG(D_HA, "Store old cookie "LPX64" in new mfd\n",
784                        info->mti_rr.rr_handle->cookie);
785
786                 mfd->mfd_old_handle.cookie = info->mti_rr.rr_handle->cookie;
787         }
788
789         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
790
791         if (req->rq_export->exp_disconnected) {
792                 spin_lock(&med->med_open_lock);
793                 class_handle_unhash(&mfd->mfd_handle);
794                 cfs_list_del_init(&mfd->mfd_list);
795                 spin_unlock(&med->med_open_lock);
796                 mdt_mfd_close(info, mfd);
797         } else {
798                 spin_lock(&med->med_open_lock);
799                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
800                 spin_unlock(&med->med_open_lock);
801         }
802
803         mdt_empty_transno(info, rc);
804
805         RETURN(rc);
806
807 err_out:
808         if (flags & FMODE_WRITE)
809                         /* XXX We also need to close io epoch here.
810                          * See LU-1220 - green */
811                 mdt_write_put(o);
812         else if (flags & FMODE_EXEC)
813                 mdt_write_allow(o);
814         return rc;
815 }
816
/**
 * Finish an open on object \a o and fill in the reply.
 *
 * Packs the attributes already held in info->mti_attr into the reply body,
 * adds remote-permission/ACL and capability data when the export negotiated
 * them, rejects opens that cannot proceed, and finally obtains an open
 * handle: the one already recorded for a resent request, or a new one from
 * mdt_mfd_open().
 *
 * \param[in] info     thread info; mti_attr must already have MA_INODE valid
 * \param[in] p        parent object, only passed through to mdt_mfd_open()
 * \param[in] o        object being opened
 * \param[in] flags    open flags (MDS_OPEN_* / FMODE_*)
 * \param[in] created  non-zero if \a o was created by this request; used for
 *                     the exclusive-create check and passed to mdt_mfd_open()
 * \param[in] rep      intent reply on which dispositions are set
 *
 * \retval 0           success; also returned without an open handle for a
 *                     symlink, or for a special node when the export has
 *                     OBD_CONNECT_NODEVOH
 * \retval negative    errno on failure
 */
817 int mdt_finish_open(struct mdt_thread_info *info,
818                     struct mdt_object *p, struct mdt_object *o,
819                     __u64 flags, int created, struct ldlm_reply *rep)
820 {
821         struct ptlrpc_request   *req = mdt_info_req(info);
822         struct obd_export       *exp = req->rq_export;
823         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
824         struct md_attr          *ma  = &info->mti_attr;
825         struct lu_attr          *la  = &ma->ma_attr;
826         struct mdt_file_data    *mfd;
827         struct mdt_body         *repbody;
828         int                      rc = 0;
829         int                      isreg, isdir, islnk;
830         cfs_list_t              *t;
831         ENTRY;
832
833         LASSERT(ma->ma_valid & MA_INODE);
834
835         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
836
837         isreg = S_ISREG(la->la_mode);
838         isdir = S_ISDIR(la->la_mode);
839         islnk = S_ISLNK(la->la_mode);
840         mdt_pack_attr2body(info, repbody, la, mdt_object_fid(o));
841
842         /* LU-2275, simulate broken behaviour (esp. prevalent in
843          * pre-2.4 servers) where a very strange reply is sent on error
844          * that looks like it was almost successful and a failure at the
845          * same time */
846         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_NEGATIVE_POSITIVE)) {
847                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN |
848                                                DISP_LOOKUP_NEG |
849                                                DISP_LOOKUP_POS);
850
851                 if (flags & MDS_OPEN_LOCK)
852                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
853
854                 RETURN(-ENOENT);
855         }
856
            /* Pack remote permission or the POSIX access ACL into the RMF_ACL
             * reply buffer.  An rc left over from this step is never returned:
             * every later exit either uses a constant or assigns rc afresh. */
857         if (exp_connect_rmtclient(exp)) {
858                 void *buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
859
860                 rc = mdt_pack_remote_perm(info, o, buf);
861                 if (rc) {
862                         repbody->valid &= ~OBD_MD_FLRMTPERM;
863                         repbody->aclsize = 0;
864                 } else {
865                         repbody->valid |= OBD_MD_FLRMTPERM;
866                         repbody->aclsize = sizeof(struct mdt_remote_perm);
867                 }
868         }
869 #ifdef CONFIG_FS_POSIX_ACL
870         else if (exp_connect_flags(exp) & OBD_CONNECT_ACL) {
871                 const struct lu_env *env = info->mti_env;
872                 struct md_object *next = mdt_object_child(o);
873                 struct lu_buf *buf = &info->mti_buf;
874
875                 buf->lb_buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
876                 buf->lb_len = req_capsule_get_size(info->mti_pill, &RMF_ACL,
877                                                    RCL_SERVER);
878                 if (buf->lb_len > 0) {
879                         rc = mo_xattr_get(env, next, buf,
880                                           XATTR_NAME_ACL_ACCESS);
881                         if (rc < 0) {
882                                 if (rc == -ENODATA) {
883                                         repbody->aclsize = 0;
884                                         repbody->valid |= OBD_MD_FLACL;
885                                         rc = 0;
886                                 } else if (rc == -EOPNOTSUPP) {
887                                         rc = 0;
888                                 } else {
889                                         CERROR("got acl size: %d\n", rc);
890                                 }
891                         } else {
                                    /* non-negative rc is used as the ACL size */
892                                 repbody->aclsize = rc;
893                                 repbody->valid |= OBD_MD_FLACL;
894                                 rc = 0;
895                         }
896                 }
897         }
898 #endif
899
            /* MDS capability, only if enabled on this MDT and negotiated */
900         if (info->mti_mdt->mdt_opts.mo_mds_capa &&
901             exp_connect_flags(exp) & OBD_CONNECT_MDS_CAPA) {
902                 struct lustre_capa *capa;
903
904                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1);
905                 LASSERT(capa);
906                 capa->lc_opc = CAPA_OPC_MDS_DEFAULT;
907                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
908                 if (rc)
909                         RETURN(rc);
910                 repbody->valid |= OBD_MD_FLMDSCAPA;
911         }
912
            /* OSS capability, regular files only */
913         if (info->mti_mdt->mdt_opts.mo_oss_capa &&
914             exp_connect_flags(exp) & OBD_CONNECT_OSS_CAPA &&
915             S_ISREG(lu_object_attr(&o->mot_obj))) {
916                 struct lustre_capa *capa;
917
918                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA2);
919                 LASSERT(capa);
920                 capa->lc_opc = CAPA_OPC_OSS_DEFAULT | capa_open_opc(flags);
921                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
922                 if (rc)
923                         RETURN(rc);
924                 repbody->valid |= OBD_MD_FLOSSCAPA;
925         }
926
927         /*
928          * If we are following a symlink, don't open; and do not return open
929          * handle for special nodes as client required.
930          */
931         if (islnk || (!isreg && !isdir &&
932             (exp_connect_flags(req->rq_export) & OBD_CONNECT_NODEVOH))) {
933                 lustre_msg_set_transno(req->rq_repmsg, 0);
934                 RETURN(0);
935         }
936
937         /*
938          * We need to return the existing object's fid back, so it is done here,
939          * after preparing the reply.
940          */
941         if (!created && (flags & MDS_OPEN_EXCL) && (flags & MDS_OPEN_CREAT))
942                 RETURN(-EEXIST);
943
944         /* This can't be done earlier, we need to return reply body */
945         if (isdir) {
946                 if (flags & (MDS_OPEN_CREAT | FMODE_WRITE)) {
947                         /* We are trying to create or write an existing dir. */
948                         RETURN(-EISDIR);
949                 }
950         } else if (flags & MDS_OPEN_DIRECTORY)
951                 RETURN(-ENOTDIR);
952
953         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_OPEN_CREATE,
954                                  OBD_FAIL_LDLM_REPLY | OBD_FAIL_ONCE)) {
955                 RETURN(-EAGAIN);
956         }
957
            /* Resent request: search the export's open list for an mfd that
             * was created for the same xid; if one is found, reply with its
             * cookie instead of opening the object a second time. */
958         mfd = NULL;
959         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
960                 spin_lock(&med->med_open_lock);
961                 cfs_list_for_each(t, &med->med_open_head) {
962                         mfd = cfs_list_entry(t, struct mdt_file_data, mfd_list);
963                         if (mfd->mfd_xid == req->rq_xid)
964                                 break;
                            /* reset so that mfd is NULL if the loop ends
                             * without a match */
965                         mfd = NULL;
966                 }
967                 spin_unlock(&med->med_open_lock);
968
969                 if (mfd != NULL) {
970                         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
971                         /*set repbody->ea_size for resent case*/
972                         if (ma->ma_valid & MA_LOV) {
973                                 LASSERT(ma->ma_lmm_size != 0);
974                                 repbody->eadatasize = ma->ma_lmm_size;
975                                 if (isdir)
976                                         repbody->valid |= OBD_MD_FLDIREA;
977                                 else
978                                         repbody->valid |= OBD_MD_FLEASIZE;
979                         }
980                         mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
981                         RETURN(0);
982                 }
983         }
984
985         rc = mdt_mfd_open(info, p, o, flags, created);
986         if (!rc)
987                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
988
989         RETURN(rc);
990 }
991
/* Prototype of a helper that is not defined in the visible part of this
 * file; mdt_reconstruct_open() calls it before restoring the saved
 * disposition.
 * NOTE(review): a file-local extern like this would normally live in a
 * shared header -- confirm where mdt_req_from_lcd() is defined. */
992 extern void mdt_req_from_lcd(struct ptlrpc_request *req,
993                              struct lsd_client_data *lcd);
994
/**
 * Reconstruct the reply of an open request from the data recorded for the
 * client in the export (ted->ted_lcd).
 *
 * The disposition stored in lcd->lcd_last_data is set on the intent reply,
 * then:
 *  - if a create was attempted (DISP_OPEN_CREATE) and req->rq_status is
 *    non-zero, that status is returned to the client;
 *  - if a create was attempted and the status is zero, parent and child are
 *    looked up again and the open is finished on the existing child, or a
 *    regular open is performed when the child does not exist;
 *  - otherwise the request is handled as a pure open via mdt_reint_open().
 *
 * The result is stored in req->rq_status and in the reply message status.
 * If the parent or child lookup fails, the client is evicted and the
 * function returns without updating the status.
 *
 * \param[in] info  thread info of the request being reconstructed
 * \param[in] lhc   lock handle passed on to mdt_reint_open()
 */
995 void mdt_reconstruct_open(struct mdt_thread_info *info,
996                           struct mdt_lock_handle *lhc)
997 {
998         const struct lu_env *env = info->mti_env;
999         struct mdt_device       *mdt  = info->mti_mdt;
1000         struct req_capsule      *pill = info->mti_pill;
1001         struct ptlrpc_request   *req  = mdt_info_req(info);
1002         struct tg_export_data   *ted  = &req->rq_export->exp_target_data;
1003         struct lsd_client_data  *lcd  = ted->ted_lcd;
1004         struct md_attr          *ma   = &info->mti_attr;
1005         struct mdt_reint_record *rr   = &info->mti_rr;
1006         __u64                   flags = info->mti_spec.sp_cr_flags;
1007         struct ldlm_reply       *ldlm_rep;
1008         struct mdt_object       *parent;
1009         struct mdt_object       *child;
1010         struct mdt_body         *repbody;
1011         int                      rc;
1012         ENTRY;
1013
1014         LASSERT(pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1015         ldlm_rep = req_capsule_server_get(pill, &RMF_DLM_REP);
1016         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
1017
1018         ma->ma_lmm = req_capsule_server_get(pill, &RMF_MDT_MD);
1019         ma->ma_lmm_size = req_capsule_get_size(pill, &RMF_MDT_MD,
1020                                                RCL_SERVER);
1021         ma->ma_need = MA_INODE | MA_HSM;
1022         if (ma->ma_lmm_size > 0)
1023                 ma->ma_need |= MA_LOV;
1024
1025         ma->ma_valid = 0;
1026
             /* NOTE(review): presumably this restores req->rq_status (read
              * below) from the lcd -- confirm in mdt_req_from_lcd(). */
1027         mdt_req_from_lcd(req, lcd);
1028         mdt_set_disposition(info, ldlm_rep, lcd->lcd_last_data);
1029
1030         CDEBUG(D_INODE, "This is reconstruct open: disp="LPX64", result=%d\n",
1031                ldlm_rep->lock_policy_res1, req->rq_status);
1032
1033         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE) &&
1034             req->rq_status != 0)
1035                 /* We did not create successfully, return error to client. */
1036                 GOTO(out, rc = req->rq_status);
1037
1038         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE)) {
1039                 struct obd_export *exp = req->rq_export;
1040                 /*
1041                  * We failed after creation, but we do not know in which step
1042                  * we failed. So try to check the child object.
1043                  */
1044                 parent = mdt_object_find(env, mdt, rr->rr_fid1);
1045                 if (IS_ERR(parent)) {
1046                         rc = PTR_ERR(parent);
1047                         LCONSOLE_WARN("Parent "DFID" lookup error %d."
1048                                       " Evicting client %s with export %s.\n",
1049                                       PFID(rr->rr_fid1), rc,
1050                                       obd_uuid2str(&exp->exp_client_uuid),
1051                                       obd_export_nid2str(exp));
1052                         mdt_export_evict(exp);
1053                         RETURN_EXIT;
1054                 }
1055                 child = mdt_object_find(env, mdt, rr->rr_fid2);
1056                 if (IS_ERR(child)) {
1057                         rc = PTR_ERR(child);
                             /* child is an error pointer here and must not be
                              * dereferenced; report the fid that was looked
                              * up instead. */
1058                         LCONSOLE_WARN("Child "DFID" lookup error %d."
1059                                       " Evicting client %s with export %s.\n",
1060                                       PFID(rr->rr_fid2), rc,
1061                                       obd_uuid2str(&exp->exp_client_uuid),
1062                                       obd_export_nid2str(exp));
1063                         mdt_object_put(env, parent);
1064                         mdt_export_evict(exp);
1065                         RETURN_EXIT;
1066                 }
1067
1068                 if (unlikely(mdt_object_remote(child))) {
1069                         /* the child object was created on remote server */
1070                         if (!mdt_is_dne_client(exp)) {
1071                                 /* Return -EIO for old client */
1072                                 mdt_object_put(env, parent);
1073                                 mdt_object_put(env, child);
1074                                 GOTO(out, rc = -EIO);
1075                         }
1076                         repbody->fid1 = *rr->rr_fid2;
1077                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1078                         rc = 0;
1079                 } else {
1080                         if (mdt_object_exists(child)) {
1081                                 mdt_set_capainfo(info, 1, rr->rr_fid2,
1082                                                  BYPASS_CAPA);
1083                                 rc = mdt_attr_get_complex(info, child, ma);
1084                                 if (rc == 0)
1085                                         rc = mdt_finish_open(info, parent,
1086                                                              child, flags,
1087                                                              1, ldlm_rep);
1088                         } else {
1089                                 /* the child does not exist, we should do
1090                                  * regular open */
1091                                 mdt_object_put(env, parent);
1092                                 mdt_object_put(env, child);
1093                                 GOTO(regular_open, 0);
1094                         }
1095                 }
1096                 mdt_object_put(env, parent);
1097                 mdt_object_put(env, child);
1098                 GOTO(out, rc);
1099         } else {
1100 regular_open:
1101                 /* We did not try to create, so we are a pure open */
1102                 rc = mdt_reint_open(info, lhc);
1103         }
1104
1105         EXIT;
1106 out:
1107         req->rq_status = rc;
1108         lustre_msg_set_status(req->rq_repmsg, req->rq_status);
             /* a failed reconstruct must not carry a transaction number */
1109         LASSERT(ergo(rc < 0, lustre_msg_get_transno(req->rq_repmsg) == 0));
1110 }
1111
/**
 * Open the object identified by info->mti_rr.rr_fid2 directly by FID.
 *
 * This function does not call any lock routine itself.  Three cases are
 * handled:
 *  - object is remote: the positive-lookup dispositions are set and the
 *    FID is returned in the reply body with OBD_MD_FLID | OBD_MD_MDS;
 *  - object exists locally: attributes are fetched and the open is finished
 *    through mdt_finish_open() with no parent and created == 0;
 *  - object does not exist: -ENOENT.
 *
 * The object reference taken by mdt_object_find() is dropped before
 * returning.
 *
 * \param[in] info  thread info; open flags come from mti_spec.sp_cr_flags
 * \param[in] rep   intent reply on which dispositions are set
 *
 * \retval 0         success
 * \retval negative  errno on failure
 */
1112 int mdt_open_by_fid(struct mdt_thread_info *info, struct ldlm_reply *rep)
1113 {
1114         __u64                    flags = info->mti_spec.sp_cr_flags;
1115         struct mdt_reint_record *rr = &info->mti_rr;
1116         struct md_attr          *ma = &info->mti_attr;
1117         struct mdt_object       *o;
1118         int                      rc;
1119         ENTRY;
1120
1121         o = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid2);
1122         if (IS_ERR(o))
1123                 RETURN(rc = PTR_ERR(o));
1124
1125         if (unlikely(mdt_object_remote(o))) {
1126                 /* the child object was created on remote server */
1127                 struct mdt_body *repbody;
1128
1129                 mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1130                                                 DISP_LOOKUP_EXECD |
1131                                                 DISP_LOOKUP_POS));
1132                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1133                 repbody->fid1 = *rr->rr_fid2;
1134                 repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1135                 rc = 0;
1136         } else {
1137                 if (mdt_object_exists(o)) {
1138                         mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1139                                                         DISP_LOOKUP_EXECD |
1140                                                         DISP_LOOKUP_POS));
1141
1142                         rc = mdt_attr_get_complex(info, o, ma);
1143                         if (rc == 0)
1144                                 rc = mdt_finish_open(info, NULL, o, flags, 0,
1145                                                      rep);
1146                 } else {
1147                         rc = -ENOENT;
1148                 }
1149         }
1150
1151         mdt_object_put(info->mti_env, o);
1152         RETURN(rc);
1153 }
1154
1155 /*
      * Lock object for open.
      *
      * Takes obj->mot_open_sem (write mode for a lease open, read mode for a
      * normal open), picks the ldlm lock mode and inode bits from the open
      * flags, and enqueues the lock(s).  For a replayed request nothing is
      * taken and 0 is returned immediately.
      *
      * The semaphore stays held on return, including on an error raised after
      * it was taken; in this file mdt_open_by_fid_lock() pairs every call with
      * mdt_object_open_unlock().
      *
      * \param[in]  info   thread info; open flags come from mti_spec.sp_cr_flags
      * \param[in]  obj    object being opened
      * \param[in]  lhc    lock handle for the open lock
      * \param[out] ibits  inode bits requested for the open lock, 0 if none
      *
      * \retval 0          success
      * \retval -EBUSY     lease requested while a lease or another opener exists
      * \retval -EPROTO    lease requested without MDS_OPEN_LOCK
      * \retval negative   other error returned by mdt_object_lock()
      */
1156 static int mdt_object_open_lock(struct mdt_thread_info *info,
1157                                 struct mdt_object *obj,
1158                                 struct mdt_lock_handle *lhc,
1159                                 __u64 *ibits)
1160 {
1161         struct md_attr  *ma = &info->mti_attr;
1162         __u64            open_flags = info->mti_spec.sp_cr_flags;
1163         ldlm_mode_t      lm = LCK_CR;
1164         bool             acq_lease = !!(open_flags & MDS_OPEN_LEASE);
1165         bool             try_layout = false;
1166         bool             create_layout = false;
1167         int              rc = 0;
1168         ENTRY;
1169
1170         *ibits = 0;
1171         mdt_lock_handle_init(lhc);
1172
1173         if (req_is_replay(mdt_info_req(info)))
1174                 RETURN(0);
1175
             /* Regular file: either a layout will be created by this open
              * (create_layout) or, if the client supports layout locks and
              * asked for the LOV EA, try to return a layout lock too. */
1176         if (S_ISREG(lu_object_attr(&obj->mot_obj))) {
1177                 if (ma->ma_need & MA_LOV && !(ma->ma_valid & MA_LOV) &&
1178                     md_should_create(open_flags))
1179                         create_layout = true;
1180                 if (exp_connect_layout(info->mti_exp) && !create_layout &&
1181                     ma->ma_need & MA_LOV)
1182                         try_layout = true;
1183         }
1184
1185         if (acq_lease) {
1186                 /* lease open, acquire write mode of open sem */
1187                 down_write(&obj->mot_open_sem);
1188
1189                 /* Lease exists and ask for new lease */
1190                 if (atomic_read(&obj->mot_lease_count) > 0) {
1191                         /* only exclusive open is supported, so lease
1192                          * are conflicted to each other */
1193                         GOTO(out, rc = -EBUSY);
1194                 }
1195
1196                 /* Lease must be with open lock */
1197                 if (!(open_flags & MDS_OPEN_LOCK)) {
1198                         CERROR("Request lease for file:"DFID ", but open lock "
1199                                 "is missed, open_flags = "LPO64".\n",
1200                                 PFID(mdt_object_fid(obj)), open_flags);
1201                         GOTO(out, rc = -EPROTO);
1202                 }
1203
1204                 /* XXX: only exclusive open is supported. */
1205                 lm = LCK_EX;
1206                 *ibits = MDS_INODELOCK_OPEN;
1207
1208                 /* never grant LCK_EX layout lock to client */
1209                 try_layout = false;
1210         } else { /* normal open */
1211                 /* normal open holds read mode of open sem */
1212                 down_read(&obj->mot_open_sem);
1213
1214                 if (open_flags & MDS_OPEN_LOCK) {
1215                         if (open_flags & FMODE_WRITE)
1216                                 lm = LCK_CW;
1217                         /* if file is released, we can't deny write because we must
1218                          * restore (write) it to access it. */
1219                         else if ((open_flags & MDS_FMODE_EXEC) &&
1220                                  !((ma->ma_valid & MA_HSM) &&
1221                                    (ma->ma_hsm.mh_flags & HS_RELEASED)))
1222                                 lm = LCK_PR;
1223                         else
1224                                 lm = LCK_CR;
1225
1226                         *ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_OPEN;
1227                 } else if (atomic_read(&obj->mot_lease_count) > 0) {
1228                         if (open_flags & FMODE_WRITE)
1229                                 lm = LCK_CW;
1230                         else
1231                                 lm = LCK_CR;
1232
1233                         /* revoke lease */
1234                         *ibits = MDS_INODELOCK_OPEN;
1235                         try_layout = false;
1236
                             /* from here on the lock is taken through the
                              * local handle instead of the caller's one */
1237                         lhc = &info->mti_lh[MDT_LH_LOCAL];
1238                 }
                     /* print the lease count so that the value matches the
                      * "lease count" label of the message */
1239                 CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
1240                         PFID(mdt_object_fid(obj)),
1241                         atomic_read(&obj->mot_lease_count), lm);
1242         }
1243
1244         mdt_lock_reg_init(lhc, lm);
1245
1246         /* one problem to return layout lock on open is that it may result
1247          * in too many layout locks cached on the client side. */
1248         if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_OPEN) && try_layout) {
1249                 /* return lookup lock to validate inode at the client side,
1250                  * this is pretty important otherwise mdt will return layout
1251                  * lock for each open.
1252                  * However this is a double-edged sword because changing
1253                  * permission will revoke huge # of LOOKUP locks. */
1254                 *ibits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
1255                 if (!mdt_object_lock_try(info, obj, lhc, *ibits,
1256                                          MDT_CROSS_LOCK)) {
                             /* try-lock failed: drop the extra bits and fall
                              * back to a blocking lock on what remains */
1257                         *ibits &= ~(MDS_INODELOCK_LAYOUT|MDS_INODELOCK_LOOKUP);
1258                         if (*ibits != 0)
1259                                 rc = mdt_object_lock(info, obj, lhc, *ibits,
1260                                                 MDT_CROSS_LOCK);
1261                 }
1262         } else if (*ibits != 0) {
1263                 rc = mdt_object_lock(info, obj, lhc, *ibits, MDT_CROSS_LOCK);
1264         }
1265
1266         CDEBUG(D_INODE, "Requested bits lock:"DFID ", ibits = "LPX64
1267                 ", open_flags = "LPO64", try_layout = %d, rc = %d\n",
1268                 PFID(mdt_object_fid(obj)), *ibits, open_flags, try_layout, rc);
1269
1270         /* will change layout, revoke layout locks by enqueuing EX lock. */
1271         if (rc == 0 && create_layout) {
1272                 struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LAYOUT];
1273
1274                 CDEBUG(D_INODE, "Will create layout, get EX layout lock:"DFID
1275                         ", open_flags = "LPO64"\n",
1276                         PFID(mdt_object_fid(obj)), open_flags);
1277
1278                 LASSERT(!try_layout);
1279                 mdt_lock_handle_init(ll);
1280                 mdt_lock_reg_init(ll, LCK_EX);
1281                 rc = mdt_object_lock(info, obj, ll, MDS_INODELOCK_LAYOUT,
1282                                         MDT_LOCAL_LOCK);
1283
1284                 OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LL_BLOCK, 2);
1285         }
1286
1287         /* Check if there is any other open handles after acquiring
1288          * open lock. At this point, caching open handles have been revoked
1289          * by open lock.
1290          * XXX: Now only exclusive open is supported. Need to check the
1291          * type of open for generic lease support. */
1292         if (rc == 0 && acq_lease) {
1293                 struct ptlrpc_request *req = mdt_info_req(info);
1294                 struct mdt_export_data *med = &req->rq_export->exp_mdt_data;
1295                 struct mdt_file_data *mfd;
1296                 bool is_replay_or_resent;
1297                 int open_count = 0;
1298
1299                 /* For lease: application can open a file and then apply lease,
1300                  * @handle contains original open handle in that case.
1301                  * In recovery, open REQ will be replayed and the lease REQ may
1302                  * be resent that means the open handle is already stale, so we
1303                  * need to fix it up here by finding new handle. */
1304                 is_replay_or_resent = req_is_replay(req) ||
1305                         lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT;
1306
1307                 /* if the request is _not_ a replay request, rr_handle
1308                  * may be used to hold an openhandle which is issuing the
1309                  * lease request, so that this openhandle doesn't count. */
1310                 mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle,
1311                                      is_replay_or_resent);
1312                 if (mfd != NULL)
1313                         ++open_count;
1314
1315                 CDEBUG(D_INODE, "acq_lease "DFID": openers: %d, want: %d\n",
1316                         PFID(mdt_object_fid(obj)),
1317                         atomic_read(&obj->mot_open_count), open_count);
1318
                     /* any opener beyond the requester's own handle makes
                      * the exclusive lease impossible */
1319                 if (atomic_read(&obj->mot_open_count) > open_count)
1320                         GOTO(out, rc = -EBUSY);
1321         }
1322         GOTO(out, rc);
1323
1324 out:
1325         RETURN(rc);
1326 }
1327
/**
 * Counterpart of mdt_object_open_lock(): drop the locks held in the
 * MDT_LH_LOCAL and MDT_LH_LAYOUT handles, release obj->mot_open_sem, and
 * decide whether the lock in \a lhc is kept or dropped.
 *
 * Nothing is done for a replayed request.
 *
 * \param[in] info   thread info; open flags come from mti_spec.sp_cr_flags
 * \param[in] obj    object that was locked
 * \param[in] lhc    lock handle of the open lock
 * \param[in] ibits  inode bits reported by the lock step
 * \param[in] rc     result of the open so far; a non-zero value other than
 *                   -EREMOTE makes the lock in \a lhc be dropped
 */
1328 static void mdt_object_open_unlock(struct mdt_thread_info *info,
1329                                    struct mdt_object *obj,
1330                                    struct mdt_lock_handle *lhc,
1331                                    __u64 ibits, int rc)
1332 {
1333         __u64 open_flags = info->mti_spec.sp_cr_flags;
1334         struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LOCAL];
1335         ENTRY;
1336
1337         if (req_is_replay(mdt_info_req(info)))
1338                 RETURN_EXIT;
1339
1340         /* Release local lock - the lock put in MDT_LH_LOCAL will never
1341          * return to client side. */
1342         if (lustre_handle_is_used(&ll->mlh_reg_lh))
1343                 mdt_object_unlock(info, obj, ll, 1);
1344
1345         ll = &info->mti_lh[MDT_LH_LAYOUT];
1346         /* Release local layout lock, layout was created */
1347         if (lustre_handle_is_used(&ll->mlh_reg_lh)) {
1348                 LASSERT(!(ibits & MDS_INODELOCK_LAYOUT));
1349                 mdt_object_unlock(info, obj, ll, 1);
1350         }
1351
             /* release the semaphore in the same mode it was taken in */
1352         if (open_flags & MDS_OPEN_LEASE)
1353                 up_write(&obj->mot_open_sem);
1354         else
1355                 up_read(&obj->mot_open_sem);
1356
1357         /* Cross-ref case, the lock should be returned to the client */
1358         if (ibits == 0 || rc == -EREMOTE)
1359                 RETURN_EXIT;
1360
1361         if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
1362                 /* for the open request, the lock will only return to client
1363                  * if open or layout lock is granted. */
                     /* rc is only used as a "drop the lock" flag from here on */
1364                 rc = 1;
1365         }
1366
1367         if (rc != 0) {
1368                 struct ldlm_reply       *ldlm_rep;
1369
1370                 ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1371                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1372                 mdt_object_unlock(info, obj, lhc, 1);
1373         }
1374         RETURN_EXIT;
1375 }
1376
1377 /**
1378  * Check whether release is permitted for the current HSM flags.
      *
      * Release is allowed only when the HSM attributes in \a ma are valid,
      * none of HS_DIRTY, HS_NORELEASE or HS_LOST is set, and HS_ARCHIVED is
      * set.
      *
      * \param[in] ma  attributes of the file to be released
      *
      * \retval true   release may proceed
      * \retval false  HSM state is unknown or forbids release
1379  */
1380 static bool mdt_hsm_release_allow(const struct md_attr *ma)
1381 {
             /* without valid HSM attributes nothing can be decided */
1382         if (!(ma->ma_valid & MA_HSM))
1383                 return false;
1384
1385         if (ma->ma_hsm.mh_flags & (HS_DIRTY|HS_NORELEASE|HS_LOST))
1386                 return false;
1387
1388         if (!(ma->ma_hsm.mh_flags & HS_ARCHIVED))
1389                 return false;
1390
1391         return true;
1392 }
1393
/**
 * Open the object identified by info->mti_rr.rr_fid2 by FID, taking the
 * open semaphore and lock through mdt_object_open_lock().
 *
 * Steps: when md_should_create(flags) is true and MDS_OPEN_HAS_EA is not
 * set, look up the parent from rr_fid1 (or request MA_PFID if that is not
 * possible); find the object; fetch its attributes; for a release request
 * verify the HSM flags; lock; call mdt_finish_open(); on success set the
 * reply dispositions.  The lock step is always undone through
 * mdt_object_open_unlock() once it has been attempted.
 *
 * \param[in] info  thread info; open flags come from mti_spec.sp_cr_flags
 * \param[in] rep   intent reply on which dispositions are set
 * \param[in] lhc   lock handle for the open lock
 *
 * \retval 0          success
 * \retval -EREMOTE   object is on a remote MDT
 * \retval -ENOENT    object does not exist
 * \retval -EPERM     release requested but HSM flags do not allow it
 * \retval negative   other errno from lookup, attribute fetch, locking or
 *                    mdt_finish_open()
 */
1394 int mdt_open_by_fid_lock(struct mdt_thread_info *info, struct ldlm_reply *rep,
1395                          struct mdt_lock_handle *lhc)
1396 {
1397         const struct lu_env     *env   = info->mti_env;
1398         struct mdt_device       *mdt   = info->mti_mdt;
1399         __u64                    flags = info->mti_spec.sp_cr_flags;
1400         struct mdt_reint_record *rr    = &info->mti_rr;
1401         struct md_attr          *ma    = &info->mti_attr;
1402         struct mdt_object       *parent= NULL;
1403         struct mdt_object       *o;
1404         int                      rc;
1405         __u64                    ibits = 0;
1406         ENTRY;
1407
1408         if (md_should_create(flags) && !(flags & MDS_OPEN_HAS_EA)) {
1409                 if (!lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) {
1410                         parent = mdt_object_find(env, mdt, rr->rr_fid1);
1411                         if (IS_ERR(parent)) {
1412                                 CDEBUG(D_INODE, "Fail to find parent "DFID
1413                                        " for anonymous created %ld, try to"
1414                                        " use server-side parent.\n",
1415                                        PFID(rr->rr_fid1), PTR_ERR(parent));
1416                                 parent = NULL;
1417                         }
1418                 }
                     /* no usable parent from the client: ask the attribute
                      * fetch below for the parent fid instead */
1419                 if (parent == NULL)
1420                         ma->ma_need |= MA_PFID;
1421         }
1422
1423         o = mdt_object_find(env, mdt, rr->rr_fid2);
1424         if (IS_ERR(o))
1425                 RETURN(rc = PTR_ERR(o));
1426
1427         if (mdt_object_remote(o)) {
1428                 CDEBUG(D_INFO, "%s: "DFID" is on remote MDT.\n",
1429                        mdt_obd_name(info->mti_mdt),
1430                        PFID(rr->rr_fid2));
1431                 GOTO(out, rc = -EREMOTE);
1432         } else if (!mdt_object_exists(o)) {
1433                 mdt_set_disposition(info, rep,
1434                                     DISP_IT_EXECD |
1435                                     DISP_LOOKUP_EXECD |
1436                                     DISP_LOOKUP_NEG);
1437                 GOTO(out, rc = -ENOENT);
1438         }
1439
1440         mdt_set_disposition(info, rep, (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1441
1442         if (flags & MDS_OPEN_RELEASE)
1443                 ma->ma_need |= MA_HSM;
1444         rc = mdt_attr_get_complex(info, o, ma);
1445         if (rc)
1446                 GOTO(out, rc);
1447
1448         /* If a release request, check file flags are fine and ask for an
1449          * exclusive open access. */
1450         if (flags & MDS_OPEN_RELEASE && !mdt_hsm_release_allow(ma))
1451                 GOTO(out, rc = -EPERM);
1452
             /* even on failure go through out_unlock: the lock step may
              * already hold the open semaphore when it returns an error */
1453         rc = mdt_object_open_lock(info, o, lhc, &ibits);
1454         if (rc)
1455                 GOTO(out_unlock, rc);
1456
             /* NOTE(review): if parent was already found above and MA_PFID
              * is nevertheless valid, parent is overwritten here without a
              * put -- confirm MA_PFID can only be valid when it was
              * requested above (parent == NULL). */
1457         if (ma->ma_valid & MA_PFID) {
1458                 parent = mdt_object_find(env, mdt, &ma->ma_pfid);
1459                 if (IS_ERR(parent)) {
1460                         CDEBUG(D_INODE, "Fail to find parent "DFID
1461                                " for anonymous created %ld, try to"
1462                                " use system default.\n",
1463                                PFID(&ma->ma_pfid), PTR_ERR(parent));
1464                         parent = NULL;
1465                 }
1466         }
1467
1468         rc = mdt_finish_open(info, parent, o, flags, 0, rep);
1469         if (!rc) {
1470                 mdt_set_disposition(info, rep, DISP_LOOKUP_POS);
1471                 if (flags & MDS_OPEN_LOCK)
1472                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
1473                 if (flags & MDS_OPEN_LEASE)
1474                         mdt_set_disposition(info, rep, DISP_OPEN_LEASE);
1475         }
1476         GOTO(out_unlock, rc);
1477
1478 out_unlock:
1479         mdt_object_open_unlock(info, o, lhc, ibits, rc);
1480 out:
1481         mdt_object_put(env, o);
1482         if (parent != NULL)
1483                 mdt_object_put(env, parent);
             /* NOTE(review): plain return although the function starts with
              * ENTRY and its early exit uses RETURN() -- confirm this is
              * intentional. */
1484         return rc;
1485 }
1486
1487 int mdt_pin(struct mdt_thread_info* info)
1488 {
1489         ENTRY;
1490         RETURN(err_serious(-EOPNOTSUPP));
1491 }
1492
1493 /* Cross-ref request. Currently it can only be a pure open (w/o create) */
1494 static int mdt_cross_open(struct mdt_thread_info *info,
1495                           const struct lu_fid *parent_fid,
1496                           const struct lu_fid *fid,
1497                           struct ldlm_reply *rep, __u32 flags)
1498 {
1499         struct md_attr    *ma = &info->mti_attr;
1500         struct mdt_object *o;
1501         int                rc;
1502         ENTRY;
1503
1504         o = mdt_object_find(info->mti_env, info->mti_mdt, fid);
1505         if (IS_ERR(o))
1506                 RETURN(rc = PTR_ERR(o));
1507
1508         if (mdt_object_remote(o)) {
1509                 /* Something is wrong here, the object is on another MDS! */
1510                 CERROR("%s: "DFID" isn't on this server!: rc = %d\n",
1511                        mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1512                 LU_OBJECT_DEBUG(D_WARNING, info->mti_env,
1513                                 &o->mot_obj,
1514                                 "Object isn't on this server! FLD error?\n");
1515                 rc = -EFAULT;
1516         } else {
1517                 if (mdt_object_exists(o)) {
1518                         /* Do permission check for cross-open. */
1519                         rc = mo_permission(info->mti_env, NULL,
1520                                            mdt_object_child(o),
1521                                            NULL, flags | MDS_OPEN_CROSS);
1522                         if (rc)
1523                                 goto out;
1524
1525                         mdt_set_capainfo(info, 0, fid, BYPASS_CAPA);
1526                         rc = mdt_attr_get_complex(info, o, ma);
1527                         if (rc != 0)
1528                                 GOTO(out, rc);
1529
1530                         /* Do not create lov object if the fid is opened
1531                          * under OBF */
1532                         if (S_ISREG(ma->ma_attr.la_mode) &&
1533                             !(ma->ma_valid & MA_LOV) && (flags & FMODE_WRITE) &&
1534                             fid_is_obf(parent_fid))
1535                                 GOTO(out, rc = -EPERM);
1536
1537                         rc = mdt_finish_open(info, NULL, o, flags, 0, rep);
1538                 } else {
1539                         /*
1540                          * Something is wrong here. lookup was positive but
1541                          * there is no object!
1542                          */
1543                         CERROR("%s: "DFID" doesn't exist!: rc = %d\n",
1544                               mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1545                         rc = -EFAULT;
1546                 }
1547         }
1548 out:
1549         mdt_object_put(info->mti_env, o);
1550         RETURN(rc);
1551 }
1552
1553 int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
1554 {
1555         struct mdt_device       *mdt = info->mti_mdt;
1556         struct ptlrpc_request   *req = mdt_info_req(info);
1557         struct mdt_object       *parent;
1558         struct mdt_object       *child;
1559         struct mdt_lock_handle  *lh;
1560         struct ldlm_reply       *ldlm_rep;
1561         struct mdt_body         *repbody;
1562         struct lu_fid           *child_fid = &info->mti_tmp_fid1;
1563         struct md_attr          *ma = &info->mti_attr;
1564         __u64                    create_flags = info->mti_spec.sp_cr_flags;
1565         __u64                    ibits = 0;
1566         struct mdt_reint_record *rr = &info->mti_rr;
1567         struct lu_name          *lname;
1568         int                      result, rc;
1569         int                      created = 0;
1570         __u32                    msg_flags;
1571         ENTRY;
1572
1573         OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_MDS_PAUSE_OPEN, OBD_FAIL_ONCE,
1574                                (obd_timeout + 1) / 4);
1575
1576         mdt_counter_incr(req, LPROC_MDT_OPEN);
1577         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1578
1579         ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD);
1580         ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD,
1581                                                RCL_SERVER);
1582         ma->ma_need = MA_INODE;
1583         if (ma->ma_lmm_size > 0)
1584                 ma->ma_need |= MA_LOV;
1585
1586         ma->ma_valid = 0;
1587
1588         LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1589         ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1590
1591         if (unlikely(create_flags & MDS_OPEN_JOIN_FILE)) {
1592                 CERROR("file join is not supported anymore.\n");
1593                 GOTO(out, result = err_serious(-EOPNOTSUPP));
1594         }
1595         msg_flags = lustre_msg_get_flags(req->rq_reqmsg);
1596
1597         if ((create_flags & (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS)) &&
1598             info->mti_spec.u.sp_ea.eadata == NULL)
1599                 GOTO(out, result = err_serious(-EINVAL));
1600
1601         CDEBUG(D_INODE, "I am going to open "DFID"/(%s->"DFID") "
1602                "cr_flag="LPO64" mode=0%06o msg_flag=0x%x\n",
1603                PFID(rr->rr_fid1), rr->rr_name,
1604                PFID(rr->rr_fid2), create_flags,
1605                ma->ma_attr.la_mode, msg_flags);
1606         if (info->mti_cross_ref) {
1607                 /* This is cross-ref open */
1608                 mdt_set_disposition(info, ldlm_rep,
1609                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD |
1610                              DISP_LOOKUP_POS));
1611                 result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
1612                                         ldlm_rep, create_flags);
1613                 GOTO(out, result);
1614         } else if (req_is_replay(req) ||
1615             (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
1616                 /* This is a replay request or from liblustre with ea. */
1617                 result = mdt_open_by_fid(info, ldlm_rep);
1618
1619                 if (result != -ENOENT) {
1620                         if (req->rq_export->exp_libclient &&
1621                             create_flags & MDS_OPEN_HAS_EA)
1622                                 GOTO(out, result = 0);
1623                         GOTO(out, result);
1624                 }
1625                 /* We didn't find the correct object, so we need to re-create it
1626                  * via a regular replay. */
1627                 if (!(create_flags & MDS_OPEN_CREAT)) {
1628                         DEBUG_REQ(D_ERROR, req,
1629                                   "OPEN & CREAT not in open replay/by_fid.");
1630                         GOTO(out, result = -EFAULT);
1631                 }
1632                 CDEBUG(D_INFO, "No object(1), continue as regular open.\n");
1633         } else if ((rr->rr_namelen == 0 && create_flags & MDS_OPEN_LOCK) ||
1634                    (create_flags & MDS_OPEN_BY_FID)) {
1635                 result = mdt_open_by_fid_lock(info, ldlm_rep, lhc);
1636                 /* If result is 0 then open by FID has found the file
1637                  * and there is nothing left for us to do here.  More
1638                  * generally if it is anything other than -ENOENT or
1639                  * -EREMOTE then we return that now.  If -ENOENT and
1640                  * MDS_OPEN_CREAT is set then we must create the file
1641                  * below.  If -EREMOTE then we need to return a LOOKUP
1642                  * lock to the client, which we do below.  Hence this
1643                  * odd looking condition.  See LU-2523. */
1644                 if (!(result == -ENOENT && (create_flags & MDS_OPEN_CREAT)) &&
1645                     result != -EREMOTE)
1646                         GOTO(out, result);
1647
1648                 if (unlikely(rr->rr_namelen == 0))
1649                         GOTO(out, result = -EINVAL);
1650
1651                 CDEBUG(D_INFO, "No object(2), continue as regular open.\n");
1652         }
1653
1654         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK))
1655                 GOTO(out, result = err_serious(-ENOMEM));
1656
1657         mdt_set_disposition(info, ldlm_rep,
1658                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1659
1660         lh = &info->mti_lh[MDT_LH_PARENT];
1661         mdt_lock_pdo_init(lh, (create_flags & MDS_OPEN_CREAT) ?
1662                           LCK_PW : LCK_PR, rr->rr_name, rr->rr_namelen);
1663
1664         parent = mdt_object_find_lock(info, rr->rr_fid1, lh,
1665                                       MDS_INODELOCK_UPDATE);
1666         if (IS_ERR(parent))
1667                 GOTO(out, result = PTR_ERR(parent));
1668
1669         /* get and check version of parent */
1670         result = mdt_version_get_check(info, parent, 0);
1671         if (result)
1672                 GOTO(out_parent, result);
1673
1674         fid_zero(child_fid);
1675
1676         lname = mdt_name(info->mti_env, (char *)rr->rr_name, rr->rr_namelen);
1677         result = mdo_lookup(info->mti_env, mdt_object_child(parent),
1678                             lname, child_fid, &info->mti_spec);
1679         LASSERTF(ergo(result == 0, fid_is_sane(child_fid)),
1680                  "looking for "DFID"/%s, result fid="DFID"\n",
1681                  PFID(mdt_object_fid(parent)), rr->rr_name, PFID(child_fid));
1682
1683         if (result != 0 && result != -ENOENT && result != -ESTALE)
1684                 GOTO(out_parent, result);
1685
1686         if (result == -ENOENT || result == -ESTALE) {
1687                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
1688                 if (result == -ESTALE) {
1689                         /*
1690                          * -ESTALE means the parent is a dead(unlinked) dir, so
1691                          * it should return -ENOENT to in accordance with the
1692                          * original mds implementaion.
1693                          */
1694                         GOTO(out_parent, result = -ENOENT);
1695                 }
1696                 if (!(create_flags & MDS_OPEN_CREAT))
1697                         GOTO(out_parent, result);
1698                 if (exp_connect_flags(req->rq_export) & OBD_CONNECT_RDONLY)
1699                         GOTO(out_parent, result = -EROFS);
1700                 *child_fid = *info->mti_rr.rr_fid2;
1701                 LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n",
1702                          PFID(child_fid));
1703                 /* In the function below, .hs_keycmp resolves to
1704                  * lu_obj_hop_keycmp() */
1705                 /* coverity[overrun-buffer-val] */
1706                 child = mdt_object_new(info->mti_env, mdt, child_fid);
1707         } else {
1708                 /*
1709                  * Check for O_EXCL is moved to the mdt_finish_open(), we need to
1710                  * return FID back in that case.
1711                  */
1712                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_POS);
1713                 child = mdt_object_find(info->mti_env, mdt, child_fid);
1714         }
1715         if (IS_ERR(child))
1716                 GOTO(out_parent, result = PTR_ERR(child));
1717
1718         /** check version of child  */
1719         rc = mdt_version_get_check(info, child, 1);
1720         if (rc)
1721                 GOTO(out_child, result = rc);
1722
1723         mdt_set_capainfo(info, 1, child_fid, BYPASS_CAPA);
1724         if (result == -ENOENT) {
1725                 /* Create under OBF and .lustre is not permitted */
1726                 if (fid_is_obf(rr->rr_fid1) || fid_is_dot_lustre(rr->rr_fid1))
1727                         GOTO(out_child, result = -EPERM);
1728
1729                 /* save versions in reply */
1730                 mdt_version_get_save(info, parent, 0);
1731                 mdt_version_get_save(info, child, 1);
1732
1733                 /* version of child will be changed */
1734                 info->mti_mos = child;
1735
1736                 /* Not found and with MDS_OPEN_CREAT: let's create it. */
1737                 mdt_set_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1738
1739                 /* Let lower layers know what is lock mode on directory. */
1740                 info->mti_spec.sp_cr_mode =
1741                         mdt_dlm_mode2mdl_mode(lh->mlh_pdo_mode);
1742
1743                 /*
1744                  * Do not perform lookup sanity check. We know that name does
1745                  * not exist.
1746                  */
1747                 info->mti_spec.sp_cr_lookup = 0;
1748                 info->mti_spec.sp_feat = &dt_directory_features;
1749
1750                 result = mdo_create(info->mti_env,
1751                                     mdt_object_child(parent),
1752                                     lname,
1753                                     mdt_object_child(child),
1754                                     &info->mti_spec,
1755                                     &info->mti_attr);
1756                 if (result == -ERESTART) {
1757                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1758                         GOTO(out_child, result);
1759                 } else {
1760
1761                         /* XXX: we should call this once, see few lines below */
1762                         if (result == 0)
1763                                 result = mdt_attr_get_complex(info, child, ma);
1764
1765                         if (result != 0)
1766                                 GOTO(out_child, result);
1767                 }
1768                 created = 1;
1769         } else {
1770                 /*
1771                  * The object is on remote node, return its FID for remote open.
1772                  */
1773                 if (mdt_object_remote(child)) {
1774                         /*
1775                          * Check if this lock already was sent to client and
1776                          * this is resent case. For resent case do not take lock
1777                          * again, use what is already granted.
1778                          */
1779                         LASSERT(lhc != NULL);
1780
1781                         if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
1782                                 struct ldlm_lock *lock;
1783
1784                                 LASSERT(msg_flags & MSG_RESENT);
1785
1786                                 lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
1787                                 if (!lock) {
1788                                         CERROR("Invalid lock handle "LPX64"\n",
1789                                                lhc->mlh_reg_lh.cookie);
1790                                         LBUG();
1791                                 }
1792                                 LASSERT(fid_res_name_eq(mdt_object_fid(child),
1793                                                         &lock->l_resource->lr_name));
1794                                 LDLM_LOCK_PUT(lock);
1795                                 rc = 0;
1796                         } else {
1797                                 mdt_lock_handle_init(lhc);
1798                                 mdt_lock_reg_init(lhc, LCK_PR);
1799
1800                                 rc = mdt_object_lock(info, child, lhc,
1801                                                      MDS_INODELOCK_LOOKUP,
1802                                                      MDT_CROSS_LOCK);
1803                         }
1804                         repbody->fid1 = *mdt_object_fid(child);
1805                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1806                         if (rc != 0)
1807                                 result = rc;
1808                         else
1809                                 result = -EREMOTE;
1810                         GOTO(out_child, result);
1811                 } else {
1812                         if (mdt_object_exists(child)) {
1813                                 /* We have to get attr & LOV EA & HSM for this
1814                                  * object */
1815                                 ma->ma_need |= MA_HSM;
1816                                 result = mdt_attr_get_complex(info, child, ma);
1817                         } else {
1818                                 /*object non-exist!!!*/
1819                                 LBUG();
1820                         }
1821                 }
1822         }
1823
1824         LASSERT(!lustre_handle_is_used(&lhc->mlh_reg_lh));
1825
1826         /* get openlock if this is not replay and if a client requested it */
1827         if (!req_is_replay(req)) {
1828                 rc = mdt_object_open_lock(info, child, lhc, &ibits);
1829                 if (rc != 0)
1830                         GOTO(out_child_unlock, result = rc);
1831                 else if (create_flags & MDS_OPEN_LOCK)
1832                         mdt_set_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1833         }
1834
1835         /* Try to open it now. */
1836         rc = mdt_finish_open(info, parent, child, create_flags,
1837                              created, ldlm_rep);
1838         if (rc) {
1839                 result = rc;
1840                 /* openlock will be released if mdt_finish_open failed */
1841                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1842
1843                 if (created && create_flags & MDS_OPEN_VOLATILE) {
1844                         CERROR("%s: cannot open volatile file "DFID", orphan "
1845                                "file will be left in PENDING directory until "
1846                                "next reboot, rc = %d\n", mdt_obd_name(mdt),
1847                                PFID(mdt_object_fid(child)), rc);
1848                         GOTO(out_child_unlock, result);
1849                 }
1850
1851                 if (created) {
1852                         ma->ma_need = 0;
1853                         ma->ma_valid = 0;
1854                         ma->ma_cookie_size = 0;
1855                         rc = mdo_unlink(info->mti_env,
1856                                         mdt_object_child(parent),
1857                                         mdt_object_child(child),
1858                                         lname,
1859                                         &info->mti_attr, 0);
1860                         if (rc != 0)
1861                                 CERROR("%s: "DFID" cleanup of open: rc = %d\n",
1862                                        mdt_obd_name(info->mti_mdt),
1863                                        PFID(mdt_object_fid(child)), rc);
1864                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1865                 }
1866         }
1867         EXIT;
1868 out_child_unlock:
1869         mdt_object_open_unlock(info, child, lhc, ibits, result);
1870 out_child:
1871         mdt_object_put(info->mti_env, child);
1872 out_parent:
1873         mdt_object_unlock_put(info, parent, lh, result || !created);
1874 out:
1875         if (result)
1876                 lustre_msg_set_transno(req->rq_repmsg, 0);
1877         return result;
1878 }
1879
1880 /**
1881  * Create an orphan object use local root.
1882  */
1883 static struct mdt_object *mdt_orphan_open(struct mdt_thread_info *info,
1884                                           struct mdt_device *mdt,
1885                                           const struct lu_fid *fid,
1886                                           struct md_attr *attr, fmode_t fmode)
1887 {
1888         const struct lu_env *env = info->mti_env;
1889         struct md_op_spec *spec = &info->mti_spec;
1890         struct lu_fid *local_root_fid = &info->mti_tmp_fid1;
1891         struct mdt_object *obj = NULL;
1892         struct mdt_object *local_root;
1893         static const char name[] = "i_am_nobody";
1894         struct lu_name *lname;
1895         struct lu_ucred *uc;
1896         cfs_cap_t uc_cap_save;
1897         int rc;
1898         ENTRY;
1899
1900         rc = dt_root_get(env, mdt->mdt_bottom, local_root_fid);
1901         if (rc != 0)
1902                 RETURN(ERR_PTR(rc));
1903
1904         local_root = mdt_object_find(env, mdt, local_root_fid);
1905         if (IS_ERR(local_root))
1906                 RETURN(local_root);
1907
1908         obj = mdt_object_new(env, mdt, fid);
1909         if (IS_ERR(obj))
1910                 GOTO(out, rc = PTR_ERR(obj));
1911
1912         spec->sp_cr_lookup = 0;
1913         spec->sp_feat = &dt_directory_features;
1914         spec->sp_cr_mode = MDL_MINMODE; /* no lock */
1915         spec->sp_cr_flags = MDS_OPEN_VOLATILE | fmode;
1916         if (attr->ma_valid & MA_LOV) {
1917                 spec->u.sp_ea.eadata = attr->ma_lmm;
1918                 spec->u.sp_ea.eadatalen = attr->ma_lmm_size;
1919                 spec->sp_cr_flags |= MDS_OPEN_HAS_EA;
1920         } else {
1921                 spec->sp_cr_flags |= MDS_OPEN_DELAY_CREATE;
1922         }
1923
1924         lname = mdt_name(env, (char *)name, sizeof(name) - 1);
1925
1926         uc = lu_ucred(env);
1927         uc_cap_save = uc->uc_cap;
1928         uc->uc_cap |= 1 << CFS_CAP_DAC_OVERRIDE;
1929         rc = mdo_create(env, mdt_object_child(local_root), lname,
1930                         mdt_object_child(obj), spec, attr);
1931         uc->uc_cap = uc_cap_save;
1932         if (rc < 0) {
1933                 CERROR("%s: cannot create volatile file "DFID": rc = %d\n",
1934                        mdt_obd_name(mdt), PFID(fid), rc);
1935                 GOTO(out, rc);
1936         }
1937
1938         rc = mo_open(env, mdt_object_child(obj), MDS_OPEN_CREATED);
1939         if (rc < 0)
1940                 CERROR("%s: cannot open volatile file "DFID", orphan "
1941                        "file will be left in PENDING directory until "
1942                        "next reboot, rc = %d\n", mdt_obd_name(mdt),
1943                        PFID(fid), rc);
1944         GOTO(out, rc);
1945
1946 out:
1947         if (rc < 0) {
1948                 if (!IS_ERR(obj))
1949                         mdt_object_put(env, obj);
1950                 obj = ERR_PTR(rc);
1951         }
1952         mdt_object_put(env, local_root);
1953         return obj;
1954 }
1955
1956 static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o,
1957                            struct md_attr *ma)
1958 {
1959         struct mdt_lock_handle *lh = &info->mti_lh[MDT_LH_LAYOUT];
1960         struct close_data      *data;
1961         struct ldlm_lock       *lease;
1962         struct mdt_object      *orphan;
1963         struct md_attr         *orp_ma;
1964         struct lu_buf          *buf;
1965         bool                    lease_broken;
1966         int                     rc;
1967         int                     rc2;
1968         ENTRY;
1969
1970         data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
1971         if (data == NULL)
1972                 RETURN(-EPROTO);
1973
1974         lease = ldlm_handle2lock(&data->cd_handle);
1975         if (lease == NULL)
1976                 RETURN(-ESTALE);
1977
1978         /* try to hold open_sem so that nobody else can open the file */
1979         if (!down_write_trylock(&o->mot_open_sem)) {
1980                 ldlm_lock_cancel(lease);
1981                 LDLM_LOCK_PUT(lease);
1982                 RETURN(-EBUSY);
1983         }
1984
1985         /* Check if the lease open lease has already canceled */
1986         lock_res_and_lock(lease);
1987         lease_broken = ldlm_is_cancel(lease);
1988         unlock_res_and_lock(lease);
1989
1990         LDLM_DEBUG(lease, DFID " lease broken? %d\n",
1991                    PFID(mdt_object_fid(o)), lease_broken);
1992
1993         /* Cancel server side lease. Client side counterpart should
1994          * have been cancelled. It's okay to cancel it now as we've
1995          * held mot_open_sem. */
1996         ldlm_lock_cancel(lease);
1997         LDLM_LOCK_PUT(lease);
1998
1999         if (lease_broken) /* don't perform release task */
2000                 GOTO(out_unlock, rc = -ESTALE);
2001
2002         if (fid_is_zero(&data->cd_fid) || !fid_is_sane(&data->cd_fid))
2003                 GOTO(out_unlock, rc = -EINVAL);
2004
2005         /* ma_need was set before but it seems fine to change it in order to
2006          * avoid modifying the one from RPC */
2007         ma->ma_need = MA_HSM;
2008         rc = mdt_attr_get_complex(info, o, ma);
2009         if (rc != 0)
2010                 GOTO(out_unlock, rc);
2011
2012         if (!mdt_hsm_release_allow(ma))
2013                 GOTO(out_unlock, rc = -EPERM);
2014
2015         /* already released? */
2016         if (ma->ma_hsm.mh_flags & HS_RELEASED)
2017                 GOTO(out_unlock, rc = 0);
2018
2019         /* Compare on-disk and packed data_version */
2020         if (data->cd_data_version != ma->ma_hsm.mh_arch_ver) {
2021                 CDEBUG(D_HSM, DFID" data_version mismatches: packed="LPU64
2022                        " and on-disk="LPU64"\n", PFID(mdt_object_fid(o)),
2023                        data->cd_data_version, ma->ma_hsm.mh_arch_ver);
2024                 GOTO(out_unlock, rc = -EPERM);
2025         }
2026
2027         ma->ma_valid = MA_INODE;
2028         ma->ma_attr.la_valid &= LA_ATIME | LA_MTIME | LA_CTIME | LA_SIZE;
2029         rc = mo_attr_set(info->mti_env, mdt_object_child(o), ma);
2030         if (rc < 0)
2031                 GOTO(out_unlock, rc);
2032
2033         ma->ma_need = MA_INODE | MA_LOV;
2034         rc = mdt_attr_get_complex(info, o, ma);
2035         if (rc < 0)
2036                 GOTO(out_unlock, rc);
2037
2038         if (!(ma->ma_valid & MA_LOV)) {
2039                 /* Even empty file are released */
2040                 memset(ma->ma_lmm, 0, sizeof(*ma->ma_lmm));
2041                 ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
2042                 ma->ma_lmm->lmm_pattern = cpu_to_le32(LOV_PATTERN_RAID0);
2043                 ma->ma_lmm->lmm_stripe_size = cpu_to_le32(LOV_MIN_STRIPE_SIZE);
2044                 ma->ma_lmm_size = sizeof(*ma->ma_lmm);
2045         } else {
2046                 /* Magic must be LOV_MAGIC_Vx_DEF otherwise LOD will interpret
2047                  * ma_lmm as lov_user_md, then it will be confused by union of
2048                  * layout_gen and stripe_offset. */
2049                 if (le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V1)
2050                         ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V1_DEF);
2051                 else if (le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V3)
2052                         ma->ma_lmm->lmm_magic = cpu_to_le32(LOV_MAGIC_V3_DEF);
2053                 else
2054                         GOTO(out_unlock, rc = -EINVAL);
2055         }
2056
2057         /* Set file as released */
2058         ma->ma_lmm->lmm_pattern |= cpu_to_le32(LOV_PATTERN_F_RELEASED);
2059
2060         /* Hopefully it's not used in this call path */
2061         orp_ma = &info->mti_u.som.attr;
2062         orp_ma->ma_attr.la_mode = S_IFREG | S_IWUSR;
2063         orp_ma->ma_attr.la_uid = ma->ma_attr.la_uid;
2064         orp_ma->ma_attr.la_gid = ma->ma_attr.la_gid;
2065         orp_ma->ma_attr.la_valid = LA_MODE | LA_UID | LA_GID;
2066         orp_ma->ma_lmm = ma->ma_lmm;
2067         orp_ma->ma_lmm_size = ma->ma_lmm_size;
2068         orp_ma->ma_valid = MA_INODE | MA_LOV;
2069         orphan = mdt_orphan_open(info, info->mti_mdt, &data->cd_fid, orp_ma,
2070                                  FMODE_WRITE);
2071         if (IS_ERR(orphan)) {
2072                 CERROR("%s: cannot open orphan file "DFID": rc = %ld\n",
2073                        mdt_obd_name(info->mti_mdt), PFID(&data->cd_fid),
2074                        PTR_ERR(orphan));
2075                 GOTO(out_unlock, rc = PTR_ERR(orphan));
2076         }
2077
2078         /* Set up HSM attribute for orphan object */
2079         CLASSERT(sizeof(struct hsm_attrs) <= sizeof(info->mti_xattr_buf));
2080         buf = &info->mti_buf;
2081         buf->lb_buf = info->mti_xattr_buf;
2082         buf->lb_len = sizeof(struct hsm_attrs);
2083         ma->ma_hsm.mh_flags |= HS_RELEASED;
2084         lustre_hsm2buf(buf->lb_buf, &ma->ma_hsm);
2085         ma->ma_hsm.mh_flags &= ~HS_RELEASED;
2086         rc = mo_xattr_set(info->mti_env, mdt_object_child(orphan), buf,
2087                           XATTR_NAME_HSM, 0);
2088         if (rc < 0)
2089                 GOTO(out_close, rc);
2090
2091         mdt_lock_reg_init(lh, LCK_EX);
2092         rc = mdt_object_lock(info, o, lh, MDS_INODELOCK_LAYOUT, MDT_LOCAL_LOCK);
2093         if (rc == 0) {
2094                 /* Swap layout with orphan object */
2095                 rc = mo_swap_layouts(info->mti_env, mdt_object_child(o),
2096                                      mdt_object_child(orphan),
2097                                      SWAP_LAYOUTS_MDS_HSM);
2098
2099                 /* Release exclusive LL */
2100                 mdt_object_unlock(info, o, lh, 1);
2101         }
2102         EXIT;
2103
2104 out_close:
2105         /* Close orphan object anyway */
2106         rc2 = mo_close(info->mti_env, mdt_object_child(orphan), orp_ma,
2107                        FMODE_WRITE);
2108         if (rc2 < 0)
2109                 CERROR("%s: error closing volatile file "DFID": rc = %d\n",
2110                        mdt_obd_name(info->mti_mdt), PFID(&data->cd_fid), rc2);
2111         LU_OBJECT_DEBUG(D_HSM, info->mti_env, &orphan->mot_obj,
2112                         "object closed\n");
2113         mdt_object_put(info->mti_env, orphan);
2114
2115 out_unlock:
2116         up_write(&o->mot_open_sem);
2117
2118         if (rc == 0) { /* already released */
2119                 struct mdt_body *repbody;
2120                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
2121                 LASSERT(repbody != NULL);
2122                 repbody->valid |= OBD_MD_FLRELEASED;
2123         }
2124
2125         ma->ma_valid = 0;
2126         ma->ma_need = 0;
2127
2128         return rc;
2129 }
2130
/* True when \a mode, with the EPOCH, SOM and TRUNC bits masked out, is
 * equal to MDS_FMODE_CLOSED. */
#define MFD_CLOSED(mode)                                                    \
        (((mode) & ~(MDS_FMODE_TRUNC | MDS_FMODE_SOM | MDS_FMODE_EPOCH)) == \
         MDS_FMODE_CLOSED)
2133
2134 static int mdt_mfd_closed(struct mdt_file_data *mfd)
2135 {
2136         return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
2137 }
2138
2139 int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
2140 {
2141         struct mdt_object *o = mfd->mfd_object;
2142         struct md_object *next = mdt_object_child(o);
2143         struct md_attr *ma = &info->mti_attr;
2144         int ret = MDT_IOEPOCH_CLOSED;
2145         int rc = 0;
2146         __u64 mode;
2147         ENTRY;
2148
2149         mode = mfd->mfd_mode;
2150
2151         if (ma->ma_attr_flags & MDS_HSM_RELEASE) {
2152                 rc = mdt_hsm_release(info, o, ma);
2153                 if (rc < 0) {
2154                         CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
2155                                 mdt_obd_name(info->mti_mdt),
2156                                 PFID(mdt_object_fid(o)), rc);
2157                         /* continue to close even error occurred. */
2158                 }
2159         }
2160
2161         if ((mode & FMODE_WRITE) || (mode & MDS_FMODE_TRUNC)) {
2162                 mdt_write_put(o);
2163                 ret = mdt_ioepoch_close(info, o);
2164         } else if (mode & MDS_FMODE_EXEC) {
2165                 mdt_write_allow(o);
2166         } else if (mode & MDS_FMODE_EPOCH) {
2167                 ret = mdt_ioepoch_close(info, o);
2168         } else if (mode & MDS_FMODE_SOM) {
2169                 ret = mdt_som_au_close(info, o);
2170         }
2171
2172         /* Update atime on close only. */
2173         if ((mode & MDS_FMODE_EXEC || mode & FMODE_READ || mode & FMODE_WRITE)
2174             && (ma->ma_valid & MA_INODE) && (ma->ma_attr.la_valid & LA_ATIME)) {
2175                 /* Set the atime only. */
2176                 ma->ma_valid = MA_INODE;
2177                 ma->ma_attr.la_valid = LA_ATIME;
2178                 rc = mo_attr_set(info->mti_env, next, ma);
2179         }
2180
2181         /* If file data is modified, add the dirty flag. */
2182         if (ma->ma_attr_flags & MDS_DATA_MODIFIED)
2183                 rc = mdt_add_dirty_flag(info, o, ma);
2184
2185         ma->ma_need |= MA_INODE;
2186         ma->ma_valid &= ~MA_INODE;
2187
2188         if (!MFD_CLOSED(mode))
2189                 rc = mo_close(info->mti_env, next, ma, mode);
2190
2191         if (ret == MDT_IOEPOCH_GETATTR || ret == MDT_IOEPOCH_OPENED) {
2192                 struct mdt_export_data *med;
2193
2194                 /* The IOepoch is still opened or SOM update is needed.
2195                  * Put mfd back into the list. */
2196                 LASSERT(mdt_conn_flags(info) & OBD_CONNECT_SOM);
2197                 mdt_mfd_set_mode(mfd, ret == MDT_IOEPOCH_OPENED ?
2198                                       MDS_FMODE_EPOCH : MDS_FMODE_SOM);
2199
2200                 LASSERT(mdt_info_req(info));
2201                 med = &mdt_info_req(info)->rq_export->exp_mdt_data;
2202                 spin_lock(&med->med_open_lock);
2203                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
2204                 class_handle_hash_back(&mfd->mfd_handle);
2205                 spin_unlock(&med->med_open_lock);
2206
2207                 if (ret == MDT_IOEPOCH_OPENED) {
2208                         ret = 0;
2209                 } else {
2210                         ret = -EAGAIN;
2211                         CDEBUG(D_INODE, "Size-on-MDS attribute update is "
2212                                "needed on "DFID"\n", PFID(mdt_object_fid(o)));
2213                 }
2214         } else {
2215                 /* adjust open and lease count */
2216                 if (mode & MDS_OPEN_LEASE) {
2217                         LASSERT(atomic_read(&o->mot_lease_count) > 0);
2218                         atomic_dec(&o->mot_lease_count);
2219                 }
2220                 LASSERT(atomic_read(&o->mot_open_count) > 0);
2221                 atomic_dec(&o->mot_open_count);
2222
2223                 mdt_mfd_free(mfd);
2224                 mdt_object_put(info->mti_env, o);
2225         }
2226
2227         RETURN(rc ? rc : ret);
2228 }
2229
2230 int mdt_close(struct mdt_thread_info *info)
2231 {
2232         struct mdt_export_data *med;
2233         struct mdt_file_data   *mfd;
2234         struct mdt_object      *o;
2235         struct md_attr         *ma = &info->mti_attr;
2236         struct mdt_body        *repbody = NULL;
2237         struct ptlrpc_request  *req = mdt_info_req(info);
2238         int rc, ret = 0;
2239         ENTRY;
2240
2241         mdt_counter_incr(req, LPROC_MDT_CLOSE);
2242         /* Close may come with the Size-on-MDS update. Unpack it. */
2243         rc = mdt_close_unpack(info);
2244         if (rc)
2245                 RETURN(err_serious(rc));
2246
2247         LASSERT(info->mti_ioepoch);
2248
2249         req_capsule_set_size(info->mti_pill, &RMF_MDT_MD, RCL_SERVER,
2250                              info->mti_mdt->mdt_max_mdsize);
2251         req_capsule_set_size(info->mti_pill, &RMF_LOGCOOKIES, RCL_SERVER,
2252                              info->mti_mdt->mdt_max_cookiesize);
2253         rc = req_capsule_server_pack(info->mti_pill);
2254         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
2255                 mdt_client_compatibility(info);
2256                 if (rc == 0)
2257                         mdt_fix_reply(info);
2258                 mdt_exit_ucred(info);
2259                 RETURN(lustre_msg_get_status(req->rq_repmsg));
2260         }
2261
2262         /* Continue to close handle even if we can not pack reply */
2263         if (rc == 0) {
2264                 repbody = req_capsule_server_get(info->mti_pill,
2265                                                  &RMF_MDT_BODY);
2266                 ma->ma_lmm = req_capsule_server_get(info->mti_pill,
2267                                                     &RMF_MDT_MD);
2268                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
2269                                                        &RMF_MDT_MD,
2270                                                        RCL_SERVER);
2271                 ma->ma_cookie = req_capsule_server_get(info->mti_pill,
2272                                                        &RMF_LOGCOOKIES);
2273                 ma->ma_cookie_size = req_capsule_get_size(info->mti_pill,
2274                                                           &RMF_LOGCOOKIES,
2275                                                           RCL_SERVER);
2276                 ma->ma_need = MA_INODE | MA_LOV | MA_COOKIE;
2277                 repbody->eadatasize = 0;
2278                 repbody->aclsize = 0;
2279         } else {
2280                 rc = err_serious(rc);
2281         }
2282
2283         med = &req->rq_export->exp_mdt_data;
2284         spin_lock(&med->med_open_lock);
2285         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
2286                              req_is_replay(req));
2287         if (mdt_mfd_closed(mfd)) {
2288                 spin_unlock(&med->med_open_lock);
2289                 CDEBUG(D_INODE, "no handle for file close: fid = "DFID
2290                        ": cookie = "LPX64"\n", PFID(info->mti_rr.rr_fid1),
2291                        info->mti_ioepoch->handle.cookie);
2292                 /** not serious error since bug 3633 */
2293                 rc = -ESTALE;
2294         } else {
2295                 class_handle_unhash(&mfd->mfd_handle);
2296                 cfs_list_del_init(&mfd->mfd_list);
2297                 spin_unlock(&med->med_open_lock);
2298
2299                 /* Do not lose object before last unlink. */
2300                 o = mfd->mfd_object;
2301                 mdt_object_get(info->mti_env, o);
2302                 ret = mdt_mfd_close(info, mfd);
2303                 if (repbody != NULL)
2304                         rc = mdt_handle_last_unlink(info, o, ma);
2305                 mdt_empty_transno(info, rc);
2306                 mdt_object_put(info->mti_env, o);
2307         }
2308         if (repbody != NULL) {
2309                 mdt_client_compatibility(info);
2310                 rc = mdt_fix_reply(info);
2311         }
2312
2313         mdt_exit_ucred(info);
2314         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK))
2315                 RETURN(err_serious(-ENOMEM));
2316
2317         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET_REP,
2318                                  OBD_FAIL_MDS_CLOSE_NET_REP))
2319                 info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET_REP;
2320         RETURN(rc ? rc : ret);
2321 }
2322
2323 /**
2324  * DONE_WRITING rpc handler.
2325  *
2326  * As mfd is not kept after replayed CLOSE (see mdt_ioepoch_close_on_replay()),
2327  * only those DONE_WRITING rpc will be replayed which really wrote smth on disk,
2328  * and got a trasid. Waiting for such DONE_WRITING is not reliable, so just
2329  * skip attributes and reconstruct the reply here.
2330  */
2331 int mdt_done_writing(struct mdt_thread_info *info)
2332 {
2333         struct ptlrpc_request   *req = mdt_info_req(info);
2334         struct mdt_body         *repbody = NULL;
2335         struct mdt_export_data  *med;
2336         struct mdt_file_data    *mfd;
2337         int rc;
2338         ENTRY;
2339
2340         rc = req_capsule_server_pack(info->mti_pill);
2341         if (rc)
2342                 RETURN(err_serious(rc));
2343
2344         repbody = req_capsule_server_get(info->mti_pill,
2345                                          &RMF_MDT_BODY);
2346         repbody->eadatasize = 0;
2347         repbody->aclsize = 0;
2348
2349         /* Done Writing may come with the Size-on-MDS update. Unpack it. */
2350         rc = mdt_close_unpack(info);
2351         if (rc)
2352                 RETURN(err_serious(rc));
2353
2354         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
2355                 mdt_exit_ucred(info);
2356                 RETURN(lustre_msg_get_status(req->rq_repmsg));
2357         }
2358
2359         med = &info->mti_exp->exp_mdt_data;
2360         spin_lock(&med->med_open_lock);
2361         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
2362                              req_is_replay(req));
2363         if (mfd == NULL) {
2364                 spin_unlock(&med->med_open_lock);
2365                 CDEBUG(D_INODE, "no handle for done write: fid = "DFID
2366                        ": cookie = "LPX64" ioepoch = "LPU64"\n",
2367                        PFID(info->mti_rr.rr_fid1),
2368                        info->mti_ioepoch->handle.cookie,
2369                        info->mti_ioepoch->ioepoch);
2370                 /* If this is a replay, reconstruct the transno. */
2371                 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2372                         rc = info->mti_ioepoch->flags & MF_SOM_AU ?
2373                              -EAGAIN : 0;
2374                         mdt_empty_transno(info, rc);
2375                 } else
2376                         rc = -ESTALE;
2377                 GOTO(error_ucred, rc);
2378         }
2379
2380         LASSERT(mfd->mfd_mode == MDS_FMODE_EPOCH ||
2381                 mfd->mfd_mode == MDS_FMODE_TRUNC);
2382         class_handle_unhash(&mfd->mfd_handle);
2383         cfs_list_del_init(&mfd->mfd_list);
2384         spin_unlock(&med->med_open_lock);
2385
2386         /* Set EPOCH CLOSE flag if not set by client. */
2387         info->mti_ioepoch->flags |= MF_EPOCH_CLOSE;
2388         info->mti_attr.ma_valid = 0;
2389
2390         info->mti_attr.ma_lmm_size = info->mti_mdt->mdt_max_mdsize;
2391         OBD_ALLOC_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
2392         if (info->mti_attr.ma_lmm == NULL)
2393                 GOTO(error_ucred, rc = -ENOMEM);
2394
2395         rc = mdt_mfd_close(info, mfd);
2396
2397         OBD_FREE_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
2398         mdt_empty_transno(info, rc);
2399 error_ucred:
2400         mdt_exit_ucred(info);
2401         RETURN(rc);
2402 }