Whamcloud - gitweb
LU-3233 mdt: validate open handle cookies
[fs/lustre-release.git] / lustre / mdt / mdt_open.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2013, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/mdt/mdt_open.c
37  *
38  * Lustre Metadata Target (mdt) open/close file handling
39  *
40  * Author: Huang Hua <huanghua@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_MDS
44
45 #include <lustre_acl.h>
46 #include <lustre_mds.h>
47 #include "mdt_internal.h"
48
/*
 * "addref" callback installed in mfd_handle_ops.
 * An mdt_file_data carries no reference count at present, so there is
 * deliberately nothing to do here; @mfdp is ignored.
 */
static void mdt_mfd_get(void *mfdp)
{
        /* intentionally empty: mfd has no refcount */
}
53
54 static struct portals_handle_ops mfd_handle_ops = {
55         .hop_addref = mdt_mfd_get,
56         .hop_free   = NULL,
57 };
58
59 /* Create a new mdt_file_data struct, initialize it,
60  * and insert it to global hash table */
61 struct mdt_file_data *mdt_mfd_new(const struct mdt_export_data *med)
62 {
63         struct mdt_file_data *mfd;
64         ENTRY;
65
66         OBD_ALLOC_PTR(mfd);
67         if (mfd != NULL) {
68                 CFS_INIT_LIST_HEAD(&mfd->mfd_handle.h_link);
69                 mfd->mfd_handle.h_owner = med;
70                 CFS_INIT_LIST_HEAD(&mfd->mfd_list);
71                 class_handle_hash(&mfd->mfd_handle, &mfd_handle_ops);
72         }
73
74         RETURN(mfd);
75 }
76
77 /*
78  * Find the mfd pointed to by handle in global hash table.
79  * In case of replay the handle is obsoleted
80  * but mfd can be found in mfd list by that handle
81  */
82 struct mdt_file_data *mdt_handle2mfd(struct mdt_export_data *med,
83                                      const struct lustre_handle *handle,
84                                      bool is_replay)
85 {
86         struct mdt_file_data   *mfd;
87         ENTRY;
88
89         LASSERT(handle != NULL);
90         mfd = class_handle2object(handle->cookie, med);
91         /* during dw/setattr replay the mfd can be found by old handle */
92         if (mfd == NULL && is_replay) {
93                 cfs_list_for_each_entry(mfd, &med->med_open_head, mfd_list) {
94                         if (mfd->mfd_old_handle.cookie == handle->cookie)
95                                 RETURN(mfd);
96                 }
97                 mfd = NULL;
98         }
99
100         RETURN(mfd);
101 }
102
103 /* free mfd */
104 void mdt_mfd_free(struct mdt_file_data *mfd)
105 {
106         LASSERT(cfs_list_empty(&mfd->mfd_list));
107         OBD_FREE_RCU(mfd, sizeof *mfd, &mfd->mfd_handle);
108 }
109
110 static int mdt_create_data(struct mdt_thread_info *info,
111                            struct mdt_object *p, struct mdt_object *o)
112 {
113         struct md_op_spec     *spec = &info->mti_spec;
114         struct md_attr        *ma   = &info->mti_attr;
115         int                    rc   = 0;
116         ENTRY;
117
118         if (!md_should_create(spec->sp_cr_flags))
119                 RETURN(0);
120
121         ma->ma_need = MA_INODE | MA_LOV;
122         ma->ma_valid = 0;
123         mutex_lock(&o->mot_lov_mutex);
124         if (!(o->mot_flags & MOF_LOV_CREATED)) {
125                 if (p != NULL && (fid_is_obf(mdt_object_fid(p)) ||
126                                   fid_is_dot_lustre(mdt_object_fid(p))))
127                         GOTO(unlock, rc = -EPERM);
128
129                 rc = mdo_create_data(info->mti_env,
130                                      p ? mdt_object_child(p) : NULL,
131                                      mdt_object_child(o), spec, ma);
132                 if (rc == 0)
133                         rc = mdt_attr_get_complex(info, o, ma);
134
135                 if (rc == 0 && ma->ma_valid & MA_LOV)
136                         o->mot_flags |= MOF_LOV_CREATED;
137         }
138 unlock:
139         mutex_unlock(&o->mot_lov_mutex);
140         RETURN(rc);
141 }
142
143 static int mdt_ioepoch_opened(struct mdt_object *mo)
144 {
145         return mo->mot_ioepoch_count;
146 }
147
148 int mdt_object_is_som_enabled(struct mdt_object *mo)
149 {
150         return !mo->mot_ioepoch;
151 }
152
153 /**
154  * Re-enable Size-on-MDS.
155  * Call under ->mot_ioepoch_mutex.
156  */
157 static void mdt_object_som_enable(struct mdt_object *mo, __u64 ioepoch)
158 {
159         if (ioepoch == mo->mot_ioepoch) {
160                 LASSERT(!mdt_ioepoch_opened(mo));
161                 mo->mot_ioepoch = 0;
162                 mo->mot_flags = 0;
163         }
164 }
165
166 /**
167  * Open the IOEpoch. It is allowed if @writecount is not negative.
168  * The epoch and writecount handling is performed under the mot_ioepoch_mutex.
169  */
170 int mdt_ioepoch_open(struct mdt_thread_info *info, struct mdt_object *o,
171                      int created)
172 {
173         struct mdt_device *mdt = info->mti_mdt;
174         int cancel = 0;
175         int rc = 0;
176         ENTRY;
177
178         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
179             !S_ISREG(lu_object_attr(&o->mot_obj)))
180                 RETURN(0);
181
182         mutex_lock(&o->mot_ioepoch_mutex);
183         if (mdt_ioepoch_opened(o)) {
184                 /* Epoch continues even if there is no writers yet. */
185                 CDEBUG(D_INODE, "continue epoch "LPU64" for "DFID"\n",
186                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
187         } else {
188                 /* XXX: ->mdt_ioepoch is not initialized at the mount */
189                 spin_lock(&mdt->mdt_ioepoch_lock);
190                 if (mdt->mdt_ioepoch < info->mti_replayepoch)
191                         mdt->mdt_ioepoch = info->mti_replayepoch;
192
193                 if (info->mti_replayepoch)
194                         o->mot_ioepoch = info->mti_replayepoch;
195                 else if (++mdt->mdt_ioepoch == IOEPOCH_INVAL)
196                         o->mot_ioepoch = ++mdt->mdt_ioepoch;
197                 else
198                         o->mot_ioepoch = mdt->mdt_ioepoch;
199
200                 spin_unlock(&mdt->mdt_ioepoch_lock);
201
202                 CDEBUG(D_INODE, "starting epoch "LPU64" for "DFID"\n",
203                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
204                 if (created)
205                         o->mot_flags |= MOF_SOM_CREATED;
206                 cancel = 1;
207         }
208         o->mot_ioepoch_count++;
209         mutex_unlock(&o->mot_ioepoch_mutex);
210
211         /* Cancel Size-on-MDS attributes cached on clients for the open case.
212          * In the truncate case, see mdt_reint_setattr(). */
213         if (cancel && (info->mti_rr.rr_fid1 != NULL)) {
214                 struct mdt_lock_handle  *lh = &info->mti_lh[MDT_LH_CHILD];
215                 mdt_lock_reg_init(lh, LCK_EX);
216                 rc = mdt_object_lock(info, o, lh, MDS_INODELOCK_UPDATE,
217                                      MDT_LOCAL_LOCK);
218                 if (rc == 0)
219                         mdt_object_unlock(info, o, lh, 1);
220         }
221         RETURN(rc);
222 }
223
224 /**
225  * Update SOM on-disk attributes.
226  * If enabling, write update inodes and lustre-ea with the proper IOEpoch,
227  * mountid and attributes. If disabling, clean SOM xattr.
228  * Call under ->mot_ioepoch_mutex.
229  */
230 static int mdt_som_attr_set(struct mdt_thread_info *info,
231                             struct mdt_object *obj, __u64 ioepoch, bool enable)
232 {
233         struct md_object        *next = mdt_object_child(obj);
234         int                      rc;
235         ENTRY;
236
237         CDEBUG(D_INODE, "Size-on-MDS attribute %s for epoch "LPU64
238                " on "DFID".\n", enable ? "update" : "disabling",
239                ioepoch, PFID(mdt_object_fid(obj)));
240
241         if (enable) {
242                 struct lu_buf           *buf = &info->mti_buf;
243                 struct som_attrs        *attrs;
244                 struct md_attr          *ma = &info->mti_attr;
245                 struct lu_attr          *la = &ma->ma_attr;
246                 struct obd_device       *obd = info->mti_mdt->mdt_lut.lut_obd;
247
248                 attrs = (struct som_attrs *)info->mti_xattr_buf;
249                 CLASSERT(sizeof(info->mti_xattr_buf) >= sizeof(*attrs));
250
251                 /* pack SOM attributes */
252                 memset(attrs, 0, sizeof(*attrs));
253                 attrs->som_ioepoch = ioepoch;
254                 attrs->som_mountid = obd->u.obt.obt_mount_count;
255                 if ((la->la_valid & LA_SIZE) != 0)
256                         attrs->som_size = la->la_size;
257                 if ((la->la_valid & LA_BLOCKS) != 0)
258                         attrs->som_blocks = la->la_blocks;
259                 lustre_som_swab(attrs);
260
261                 /* update SOM attributes */
262                 buf->lb_buf = attrs;
263                 buf->lb_len = sizeof(*attrs);
264                 rc = mo_xattr_set(info->mti_env, next, buf, XATTR_NAME_SOM, 0);
265         } else {
266                 /* delete SOM attributes */
267                 rc = mo_xattr_del(info->mti_env, next, XATTR_NAME_SOM);
268         }
269
270         RETURN(rc);
271 }
272
273 /** Perform the eviction specific actions on ioepoch close. */
274 static inline int mdt_ioepoch_close_on_eviction(struct mdt_thread_info *info,
275                                                 struct mdt_object *o)
276 {
277         int rc = 0;
278
279         mutex_lock(&o->mot_ioepoch_mutex);
280         CDEBUG(D_INODE, "Eviction. Closing IOepoch "LPU64" on "DFID". "
281                "Count %d\n", o->mot_ioepoch, PFID(mdt_object_fid(o)),
282                o->mot_ioepoch_count);
283         o->mot_ioepoch_count--;
284
285         /* If eviction occured set MOF_SOM_RECOV,
286          * if no other epoch holders, disable SOM on disk. */
287         o->mot_flags |= MOF_SOM_CHANGE | MOF_SOM_RECOV;
288         if (!mdt_ioepoch_opened(o)) {
289                 rc = mdt_som_attr_set(info, o, o->mot_ioepoch, MDT_SOM_DISABLE);
290                 mdt_object_som_enable(o, o->mot_ioepoch);
291         }
292         mutex_unlock(&o->mot_ioepoch_mutex);
293         RETURN(rc);
294 }
295
296 /**
297  * Perform the replay specific actions on ioepoch close.
298  * Skip SOM attribute update if obtained and just forget about the inode state
299  * for the last ioepoch holder. The SOM cache is invalidated on MDS failure.
300  */
301 static inline int mdt_ioepoch_close_on_replay(struct mdt_thread_info *info,
302                                               struct mdt_object *o)
303 {
304         int rc = MDT_IOEPOCH_CLOSED;
305         ENTRY;
306
307         mutex_lock(&o->mot_ioepoch_mutex);
308         CDEBUG(D_INODE, "Replay. Closing epoch "LPU64" on "DFID". Count %d\n",
309                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
310         o->mot_ioepoch_count--;
311
312         /* Get an info from the replayed request if client is supposed
313          * to send an Attibute Update, reconstruct @rc if so */
314         if (info->mti_ioepoch->flags & MF_SOM_AU)
315                 rc = MDT_IOEPOCH_GETATTR;
316
317         if (!mdt_ioepoch_opened(o))
318                 mdt_object_som_enable(o, info->mti_ioepoch->ioepoch);
319         mutex_unlock(&o->mot_ioepoch_mutex);
320
321         RETURN(rc);
322 }
323
324 /**
325  * Regular file IOepoch close.
326  * Closes the ioepoch, checks the object state, apply obtained attributes and
327  * re-enable SOM on the object, if possible. Also checks if the recovery is
328  * needed and packs OBD_MD_FLGETATTRLOCK flag into the reply to force the client
329  * to obtain SOM attributes under the server-side OST locks.
330  *
331  * Return value:
332  * MDT_IOEPOCH_CLOSED if ioepoch is closed.
333  * MDT_IOEPOCH_GETATTR if ioepoch is closed but another SOM update is needed.
334  */
335 static inline int mdt_ioepoch_close_reg(struct mdt_thread_info *info,
336                                         struct mdt_object *o)
337 {
338         struct md_attr *tmp_ma;
339         struct lu_attr *la;
340         int achange, opened;
341         int recovery = 0;
342         int rc = 0, ret = MDT_IOEPOCH_CLOSED;
343         ENTRY;
344
345         la = &info->mti_attr.ma_attr;
346         achange = (info->mti_ioepoch->flags & MF_SOM_CHANGE);
347
348         mutex_lock(&o->mot_ioepoch_mutex);
349         o->mot_ioepoch_count--;
350
351         tmp_ma = &info->mti_u.som.attr;
352         tmp_ma->ma_lmm = info->mti_attr.ma_lmm;
353         tmp_ma->ma_lmm_size = info->mti_attr.ma_lmm_size;
354         tmp_ma->ma_som = &info->mti_u.som.data;
355         tmp_ma->ma_need = MA_INODE | MA_LOV | MA_SOM;
356         tmp_ma->ma_valid = 0;
357         rc = mdt_attr_get_complex(info, o, tmp_ma);
358         if (rc)
359                 GOTO(error_up, rc);
360
361         /* Check the on-disk SOM state. */
362         if (o->mot_flags & MOF_SOM_RECOV)
363                 recovery = 1;
364         else if (!(o->mot_flags & MOF_SOM_CREATED) &&
365                  !(tmp_ma->ma_valid & MA_SOM))
366                 recovery = 1;
367
368         CDEBUG(D_INODE, "Closing epoch "LPU64" on "DFID". Count %d\n",
369                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
370
371         opened = mdt_ioepoch_opened(o);
372         /**
373          * If IOEpoch is not opened, check if a Size-on-MDS update is needed.
374          * Skip the check for file with no LOV  or for unlink files.
375          */
376         if (!opened && tmp_ma->ma_valid & MA_LOV &&
377             !(tmp_ma->ma_valid & MA_INODE && tmp_ma->ma_attr.la_nlink == 0)) {
378                 if (recovery)
379                         /* If some previous writer was evicted, re-ask the
380                          * client for attributes. Even if attributes are
381                          * provided, we cannot believe in them.
382                          * Another use case is that there is no SOM cache on
383                          * disk -- first access with SOM or there was an MDS
384                          * failure. */
385                         ret = MDT_IOEPOCH_GETATTR;
386                 else if (o->mot_flags & MOF_SOM_CHANGE)
387                         /* Some previous writer changed the attribute.
388                          * Do not believe to the current Size-on-MDS
389                          * update, re-ask client. */
390                         ret = MDT_IOEPOCH_GETATTR;
391                 else if (!(la->la_valid & LA_SIZE) && achange)
392                         /* Attributes were changed by the last writer
393                          * only but no Size-on-MDS update is received.*/
394                         ret = MDT_IOEPOCH_GETATTR;
395         }
396
397         if (achange || ret == MDT_IOEPOCH_GETATTR)
398                 o->mot_flags |= MOF_SOM_CHANGE;
399
400         /* If epoch ends and relable SOM attributes are obtained, update them.
401          * Create SOM ea for new files even if there is no attributes obtained
402          * (0-length file). */
403         if (ret == MDT_IOEPOCH_CLOSED && !opened) {
404                 if (achange || o->mot_flags & MOF_SOM_CREATED) {
405                         LASSERT(achange || !(la->la_valid & LA_SIZE));
406                         rc = mdt_som_attr_set(info, o, o->mot_ioepoch,
407                                               MDT_SOM_ENABLE);
408                         /* Avoid the following setattrs of these attributes,
409                          * e.g. for atime update. */
410                         info->mti_attr.ma_valid = 0;
411                 }
412                 mdt_object_som_enable(o, o->mot_ioepoch);
413         }
414
415         mutex_unlock(&o->mot_ioepoch_mutex);
416         /* If recovery is needed, tell the client to perform GETATTR under
417          * the lock. */
418         if (ret == MDT_IOEPOCH_GETATTR && recovery) {
419                 struct mdt_body *rep;
420                 rep = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
421                 rep->valid |= OBD_MD_FLGETATTRLOCK;
422         }
423
424         RETURN(rc ? : ret);
425
426 error_up:
427         mutex_unlock(&o->mot_ioepoch_mutex);
428         return rc;
429 }
430
431 /**
432  * Close IOEpoch (opened file or MDS_FMODE_EPOCH state). It happens if:
433  * - a client closes the IOEpoch;
434  * - a client eviction occured.
435  * Return values:
436  * MDT_IOEPOCH_OPENED if the client does not close IOEpoch.
437  * MDT_IOEPOCH_CLOSED if the client closes IOEpoch.
438  * MDT_IOEPOCH_GETATTR if the client closes IOEpoch but another SOM attribute
439  * update is needed.
440  */
441 static int mdt_ioepoch_close(struct mdt_thread_info *info, struct mdt_object *o)
442 {
443         struct ptlrpc_request *req = mdt_info_req(info);
444         ENTRY;
445
446         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
447             !S_ISREG(lu_object_attr(&o->mot_obj)))
448                 RETURN(0);
449
450         LASSERT(o->mot_ioepoch_count);
451         LASSERT(info->mti_ioepoch == NULL ||
452                 info->mti_ioepoch->ioepoch == o->mot_ioepoch);
453
454         /* IOEpoch is closed only if client tells about it or eviction occures.
455          * In the replay case, always close the epoch. */
456         if (req == NULL)
457                 RETURN(mdt_ioepoch_close_on_eviction(info, o));
458         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
459                 RETURN(mdt_ioepoch_close_on_replay(info, o));
460         if (info->mti_ioepoch->flags & MF_EPOCH_CLOSE)
461                 RETURN(mdt_ioepoch_close_reg(info, o));
462         /* IO epoch is not closed. */
463         RETURN(MDT_IOEPOCH_OPENED);
464 }
465
466 /**
467  * Close MDS_FMODE_SOM state, when IOEpoch is already closed and we are waiting
468  * for attribute update. It happens if:
469  * - SOM Attribute Update is obtained;
470  * - the client failed to obtain it and informs MDS about it;
471  * - a client eviction occured.
472  * Apply obtained attributes for the 1st case, wipe out the on-disk SOM
473  * cache otherwise.
474  */
475 int mdt_som_au_close(struct mdt_thread_info *info, struct mdt_object *o)
476 {
477         struct ptlrpc_request   *req = mdt_info_req(info);
478         __u64                    ioepoch = 0;
479         int                      act = MDT_SOM_ENABLE;
480         int                      rc = 0;
481         ENTRY;
482
483         LASSERT(!req || info->mti_ioepoch);
484         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
485             !S_ISREG(lu_object_attr(&o->mot_obj)))
486                 RETURN(0);
487
488         /* No size whereas MF_SOM_CHANGE is set means client failed to
489          * obtain ost attributes, drop the SOM cache on disk if so. */
490         if (!req ||
491             (info->mti_ioepoch &&
492              info->mti_ioepoch->flags & MF_SOM_CHANGE &&
493              !(info->mti_attr.ma_attr.la_valid & LA_SIZE)))
494                 act = MDT_SOM_DISABLE;
495
496         mutex_lock(&o->mot_ioepoch_mutex);
497         /* Mark the object it is the recovery state if we failed to obtain
498          * SOM attributes. */
499         if (act == MDT_SOM_DISABLE)
500                 o->mot_flags |= MOF_SOM_RECOV;
501
502         if (!mdt_ioepoch_opened(o)) {
503                 ioepoch =  info->mti_ioepoch ?
504                         info->mti_ioepoch->ioepoch : o->mot_ioepoch;
505
506                 if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))
507                         rc = mdt_som_attr_set(info, o, ioepoch, act);
508                 mdt_object_som_enable(o, ioepoch);
509         }
510         mutex_unlock(&o->mot_ioepoch_mutex);
511         RETURN(rc);
512 }
513
514 int mdt_write_read(struct mdt_object *o)
515 {
516         int rc = 0;
517         ENTRY;
518         mutex_lock(&o->mot_ioepoch_mutex);
519         rc = o->mot_writecount;
520         mutex_unlock(&o->mot_ioepoch_mutex);
521         RETURN(rc);
522 }
523
524 int mdt_write_get(struct mdt_object *o)
525 {
526         int rc = 0;
527         ENTRY;
528         mutex_lock(&o->mot_ioepoch_mutex);
529         if (o->mot_writecount < 0)
530                 rc = -ETXTBSY;
531         else
532                 o->mot_writecount++;
533         mutex_unlock(&o->mot_ioepoch_mutex);
534         RETURN(rc);
535 }
536
537 void mdt_write_put(struct mdt_object *o)
538 {
539         ENTRY;
540         mutex_lock(&o->mot_ioepoch_mutex);
541         o->mot_writecount--;
542         mutex_unlock(&o->mot_ioepoch_mutex);
543         EXIT;
544 }
545
546 static int mdt_write_deny(struct mdt_object *o)
547 {
548         int rc = 0;
549         ENTRY;
550         mutex_lock(&o->mot_ioepoch_mutex);
551         if (o->mot_writecount > 0)
552                 rc = -ETXTBSY;
553         else
554                 o->mot_writecount--;
555         mutex_unlock(&o->mot_ioepoch_mutex);
556         RETURN(rc);
557 }
558
559 static void mdt_write_allow(struct mdt_object *o)
560 {
561         ENTRY;
562         mutex_lock(&o->mot_ioepoch_mutex);
563         o->mot_writecount++;
564         mutex_unlock(&o->mot_ioepoch_mutex);
565         EXIT;
566 }
567
568 /* there can be no real transaction so prepare the fake one */
569 static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
570 {
571         struct mdt_device      *mdt = info->mti_mdt;
572         struct ptlrpc_request  *req = mdt_info_req(info);
573         struct tg_export_data  *ted;
574         struct lsd_client_data *lcd;
575
576         ENTRY;
577         /* transaction has occurred already */
578         if (lustre_msg_get_transno(req->rq_repmsg) != 0)
579                 RETURN_EXIT;
580
581         spin_lock(&mdt->mdt_lut.lut_translock);
582         if (rc != 0) {
583                 if (info->mti_transno != 0) {
584                         struct obd_export *exp = req->rq_export;
585
586                         CERROR("%s: replay trans "LPU64" NID %s: rc = %d\n",
587                                mdt_obd_name(mdt), info->mti_transno,
588                                libcfs_nid2str(exp->exp_connection->c_peer.nid),
589                                rc);
590                         spin_unlock(&mdt->mdt_lut.lut_translock);
591                         RETURN_EXIT;
592                 }
593         } else if (info->mti_transno == 0) {
594                 info->mti_transno = ++mdt->mdt_lut.lut_last_transno;
595         } else {
596                 /* should be replay */
597                 if (info->mti_transno > mdt->mdt_lut.lut_last_transno)
598                         mdt->mdt_lut.lut_last_transno = info->mti_transno;
599         }
600         spin_unlock(&mdt->mdt_lut.lut_translock);
601
602         CDEBUG(D_INODE, "transno = "LPU64", last_committed = "LPU64"\n",
603                info->mti_transno,
604                req->rq_export->exp_obd->obd_last_committed);
605
606         req->rq_transno = info->mti_transno;
607         lustre_msg_set_transno(req->rq_repmsg, info->mti_transno);
608
609         /* update lcd in memory only for resent cases */
610         ted = &req->rq_export->exp_target_data;
611         LASSERT(ted);
612         mutex_lock(&ted->ted_lcd_lock);
613         lcd = ted->ted_lcd;
614         if (info->mti_transno < lcd->lcd_last_transno &&
615             info->mti_transno != 0) {
616                 /* This should happen during replay. Do not update
617                  * last rcvd info if replay req transno < last transno,
618                  * otherwise the following resend(after replay) can not
619                  * be checked correctly by xid */
620                 mutex_unlock(&ted->ted_lcd_lock);
621                 CDEBUG(D_HA, "%s: transno = "LPU64" < last_transno = "LPU64"\n",
622                        mdt_obd_name(mdt), info->mti_transno,
623                        lcd->lcd_last_transno);
624                 RETURN_EXIT;
625         }
626
627         if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE ||
628             lustre_msg_get_opc(req->rq_reqmsg) == MDS_DONE_WRITING) {
629                 if (info->mti_transno != 0)
630                         lcd->lcd_last_close_transno = info->mti_transno;
631                 lcd->lcd_last_close_xid = req->rq_xid;
632                 lcd->lcd_last_close_result = rc;
633         } else {
634                 /* VBR: save versions in last_rcvd for reconstruct. */
635                 __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg);
636                 if (pre_versions) {
637                         lcd->lcd_pre_versions[0] = pre_versions[0];
638                         lcd->lcd_pre_versions[1] = pre_versions[1];
639                         lcd->lcd_pre_versions[2] = pre_versions[2];
640                         lcd->lcd_pre_versions[3] = pre_versions[3];
641                 }
642                 if (info->mti_transno != 0)
643                         lcd->lcd_last_transno = info->mti_transno;
644
645                 lcd->lcd_last_xid = req->rq_xid;
646                 lcd->lcd_last_result = rc;
647                 lcd->lcd_last_data = info->mti_opdata;
648         }
649         mutex_unlock(&ted->ted_lcd_lock);
650
651         EXIT;
652 }
653
654 void mdt_mfd_set_mode(struct mdt_file_data *mfd, int mode)
655 {
656         LASSERT(mfd != NULL);
657
658         CDEBUG(D_HA, "Change mfd %p mode 0x%x->0x%x\n",
659                mfd, (unsigned int)mfd->mfd_mode, (unsigned int)mode);
660
661         mfd->mfd_mode = mode;
662 }
663
664 static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
665                         struct mdt_object *o, __u64 flags, int created)
666 {
667         struct ptlrpc_request   *req = mdt_info_req(info);
668         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
669         struct mdt_file_data    *mfd;
670         struct md_attr          *ma  = &info->mti_attr;
671         struct lu_attr          *la  = &ma->ma_attr;
672         struct mdt_body         *repbody;
673         int                      rc = 0, isdir, isreg;
674         ENTRY;
675
676         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
677
678         isreg = S_ISREG(la->la_mode);
679         isdir = S_ISDIR(la->la_mode);
680         if (isreg && !(ma->ma_valid & MA_LOV)) {
681                 /*
682                  * No EA, check whether it is will set regEA and dirEA since in
683                  * above attr get, these size might be zero, so reset it, to
684                  * retrieve the MD after create obj.
685                  */
686                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
687                                                        &RMF_MDT_MD,
688                                                        RCL_SERVER);
689                 /* in replay case, p == NULL */
690                 rc = mdt_create_data(info, p, o);
691                 if (rc)
692                         RETURN(rc);
693         }
694
695         CDEBUG(D_INODE, "after open, ma_valid bit = "LPX64" lmm_size = %d\n",
696                ma->ma_valid, ma->ma_lmm_size);
697
698         if (ma->ma_valid & MA_LOV) {
699                 LASSERT(ma->ma_lmm_size != 0);
700                 repbody->eadatasize = ma->ma_lmm_size;
701                 if (isdir)
702                         repbody->valid |= OBD_MD_FLDIREA;
703                 else
704                         repbody->valid |= OBD_MD_FLEASIZE;
705         }
706
707         if (flags & FMODE_WRITE) {
708                 rc = mdt_write_get(o);
709                 if (rc == 0) {
710                         mdt_ioepoch_open(info, o, created);
711                         repbody->ioepoch = o->mot_ioepoch;
712                 }
713         } else if (flags & MDS_FMODE_EXEC) {
714                 /* if file is released, we can't deny write because we must
715                  * restore (write) it to access it.*/
716                 if ((ma->ma_valid & MA_HSM) &&
717                     (ma->ma_hsm.mh_flags & HS_RELEASED))
718                         rc = 0;
719                 else
720                         rc = mdt_write_deny(o);
721         }
722         if (rc)
723                 RETURN(rc);
724
725         rc = mo_open(info->mti_env, mdt_object_child(o),
726                      created ? flags | MDS_OPEN_CREATED : flags);
727         if (rc)
728                 GOTO(err_out, rc);
729
730         mfd = mdt_mfd_new(med);
731         if (mfd == NULL)
732                 GOTO(err_out, rc = -ENOMEM);
733
734         /*
735          * Keep a reference on this object for this open, and is
736          * released by mdt_mfd_close().
737          */
738         mdt_object_get(info->mti_env, o);
739
740         /*
741          * @flags is always not zero. At least it should be FMODE_READ,
742          * FMODE_WRITE or MDS_FMODE_EXEC.
743          */
744         LASSERT(flags != 0);
745
746         /* Open handling. */
747         mdt_mfd_set_mode(mfd, flags);
748
749         mfd->mfd_object = o;
750         mfd->mfd_xid = req->rq_xid;
751
752         /* replay handle */
753         if (req_is_replay(req)) {
754                 struct mdt_file_data *old_mfd;
755                 /* Check wheather old cookie already exist in
756                  * the list, becasue when do recovery, client
757                  * might be disconnected from server, and
758                  * restart replay, so there maybe some orphan
759                  * mfd here, we should remove them */
760                 LASSERT(info->mti_rr.rr_handle != NULL);
761                 old_mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle, true);
762                 if (old_mfd != NULL) {
763                         CDEBUG(D_HA, "delete orphan mfd = %p, fid = "DFID", "
764                                "cookie = "LPX64"\n", mfd,
765                                PFID(mdt_object_fid(mfd->mfd_object)),
766                                info->mti_rr.rr_handle->cookie);
767                         spin_lock(&med->med_open_lock);
768                         class_handle_unhash(&old_mfd->mfd_handle);
769                         cfs_list_del_init(&old_mfd->mfd_list);
770                         spin_unlock(&med->med_open_lock);
771                         /* no attr update for that close */
772                         la->la_valid = 0;
773                         ma->ma_valid |= MA_FLAGS;
774                         ma->ma_attr_flags |= MDS_RECOV_OPEN;
775                         mdt_mfd_close(info, old_mfd);
776                         ma->ma_attr_flags &= ~MDS_RECOV_OPEN;
777                         ma->ma_valid &= ~MA_FLAGS;
778                 }
779
780                 CDEBUG(D_HA, "Store old cookie "LPX64" in new mfd\n",
781                        info->mti_rr.rr_handle->cookie);
782
783                 mfd->mfd_old_handle.cookie = info->mti_rr.rr_handle->cookie;
784         }
785
786         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
787
788         if (req->rq_export->exp_disconnected) {
789                 spin_lock(&med->med_open_lock);
790                 class_handle_unhash(&mfd->mfd_handle);
791                 cfs_list_del_init(&mfd->mfd_list);
792                 spin_unlock(&med->med_open_lock);
793                 mdt_mfd_close(info, mfd);
794         } else {
795                 spin_lock(&med->med_open_lock);
796                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
797                 spin_unlock(&med->med_open_lock);
798         }
799
800         mdt_empty_transno(info, rc);
801
802         RETURN(rc);
803
804 err_out:
805         if (flags & FMODE_WRITE)
806                         /* XXX We also need to close io epoch here.
807                          * See LU-1220 - green */
808                 mdt_write_put(o);
809         else if (flags & FMODE_EXEC)
810                 mdt_write_allow(o);
811         return rc;
812 }
813
814 int mdt_finish_open(struct mdt_thread_info *info,
815                     struct mdt_object *p, struct mdt_object *o,
816                     __u64 flags, int created, struct ldlm_reply *rep)
817 {
818         struct ptlrpc_request   *req = mdt_info_req(info);
819         struct obd_export       *exp = req->rq_export;
820         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
821         struct md_attr          *ma  = &info->mti_attr;
822         struct lu_attr          *la  = &ma->ma_attr;
823         struct mdt_file_data    *mfd;
824         struct mdt_body         *repbody;
825         int                      rc = 0;
826         int                      isreg, isdir, islnk;
827         cfs_list_t              *t;
828         ENTRY;
829
830         LASSERT(ma->ma_valid & MA_INODE);
831
832         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
833
834         isreg = S_ISREG(la->la_mode);
835         isdir = S_ISDIR(la->la_mode);
836         islnk = S_ISLNK(la->la_mode);
837         mdt_pack_attr2body(info, repbody, la, mdt_object_fid(o));
838
839         /* LU-2275, simulate broken behaviour (esp. prevalent in
840          * pre-2.4 servers where a very strange reply is sent on error
841          * that looks like it was actually almost succesful and a failure at the
842          * same time */
843         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_NEGATIVE_POSITIVE)) {
844                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN |
845                                                DISP_LOOKUP_NEG |
846                                                DISP_LOOKUP_POS);
847
848                 if (flags & MDS_OPEN_LOCK)
849                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
850
851                 RETURN(-ENOENT);
852         }
853
854         if (exp_connect_rmtclient(exp)) {
855                 void *buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
856
857                 rc = mdt_pack_remote_perm(info, o, buf);
858                 if (rc) {
859                         repbody->valid &= ~OBD_MD_FLRMTPERM;
860                         repbody->aclsize = 0;
861                 } else {
862                         repbody->valid |= OBD_MD_FLRMTPERM;
863                         repbody->aclsize = sizeof(struct mdt_remote_perm);
864                 }
865         }
866 #ifdef CONFIG_FS_POSIX_ACL
867         else if (exp_connect_flags(exp) & OBD_CONNECT_ACL) {
868                 const struct lu_env *env = info->mti_env;
869                 struct md_object *next = mdt_object_child(o);
870                 struct lu_buf *buf = &info->mti_buf;
871
872                 buf->lb_buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
873                 buf->lb_len = req_capsule_get_size(info->mti_pill, &RMF_ACL,
874                                                    RCL_SERVER);
875                 if (buf->lb_len > 0) {
876                         rc = mo_xattr_get(env, next, buf,
877                                           XATTR_NAME_ACL_ACCESS);
878                         if (rc < 0) {
879                                 if (rc == -ENODATA) {
880                                         repbody->aclsize = 0;
881                                         repbody->valid |= OBD_MD_FLACL;
882                                         rc = 0;
883                                 } else if (rc == -EOPNOTSUPP) {
884                                         rc = 0;
885                                 } else {
886                                         CERROR("got acl size: %d\n", rc);
887                                 }
888                         } else {
889                                 repbody->aclsize = rc;
890                                 repbody->valid |= OBD_MD_FLACL;
891                                 rc = 0;
892                         }
893                 }
894         }
895 #endif
896
897         if (info->mti_mdt->mdt_opts.mo_mds_capa &&
898             exp_connect_flags(exp) & OBD_CONNECT_MDS_CAPA) {
899                 struct lustre_capa *capa;
900
901                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1);
902                 LASSERT(capa);
903                 capa->lc_opc = CAPA_OPC_MDS_DEFAULT;
904                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
905                 if (rc)
906                         RETURN(rc);
907                 repbody->valid |= OBD_MD_FLMDSCAPA;
908         }
909
910         if (info->mti_mdt->mdt_opts.mo_oss_capa &&
911             exp_connect_flags(exp) & OBD_CONNECT_OSS_CAPA &&
912             S_ISREG(lu_object_attr(&o->mot_obj))) {
913                 struct lustre_capa *capa;
914
915                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA2);
916                 LASSERT(capa);
917                 capa->lc_opc = CAPA_OPC_OSS_DEFAULT | capa_open_opc(flags);
918                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
919                 if (rc)
920                         RETURN(rc);
921                 repbody->valid |= OBD_MD_FLOSSCAPA;
922         }
923
924         /*
925          * If we are following a symlink, don't open; and do not return open
926          * handle for special nodes as client required.
927          */
928         if (islnk || (!isreg && !isdir &&
929             (exp_connect_flags(req->rq_export) & OBD_CONNECT_NODEVOH))) {
930                 lustre_msg_set_transno(req->rq_repmsg, 0);
931                 RETURN(0);
932         }
933
934         /*
935          * We need to return the existing object's fid back, so it is done here,
936          * after preparing the reply.
937          */
938         if (!created && (flags & MDS_OPEN_EXCL) && (flags & MDS_OPEN_CREAT))
939                 RETURN(-EEXIST);
940
941         /* This can't be done earlier, we need to return reply body */
942         if (isdir) {
943                 if (flags & (MDS_OPEN_CREAT | FMODE_WRITE)) {
944                         /* We are trying to create or write an existing dir. */
945                         RETURN(-EISDIR);
946                 }
947         } else if (flags & MDS_OPEN_DIRECTORY)
948                 RETURN(-ENOTDIR);
949
950         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_OPEN_CREATE,
951                                  OBD_FAIL_LDLM_REPLY | OBD_FAIL_ONCE)) {
952                 RETURN(-EAGAIN);
953         }
954
955         mfd = NULL;
956         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
957                 spin_lock(&med->med_open_lock);
958                 cfs_list_for_each(t, &med->med_open_head) {
959                         mfd = cfs_list_entry(t, struct mdt_file_data, mfd_list);
960                         if (mfd->mfd_xid == req->rq_xid)
961                                 break;
962                         mfd = NULL;
963                 }
964                 spin_unlock(&med->med_open_lock);
965
966                 if (mfd != NULL) {
967                         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
968                         /*set repbody->ea_size for resent case*/
969                         if (ma->ma_valid & MA_LOV) {
970                                 LASSERT(ma->ma_lmm_size != 0);
971                                 repbody->eadatasize = ma->ma_lmm_size;
972                                 if (isdir)
973                                         repbody->valid |= OBD_MD_FLDIREA;
974                                 else
975                                         repbody->valid |= OBD_MD_FLEASIZE;
976                         }
977                         mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
978                         RETURN(0);
979                 }
980         }
981
982         rc = mdt_mfd_open(info, p, o, flags, created);
983         if (!rc)
984                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
985
986         RETURN(rc);
987 }
988
989 extern void mdt_req_from_lcd(struct ptlrpc_request *req,
990                              struct lsd_client_data *lcd);
991
992 void mdt_reconstruct_open(struct mdt_thread_info *info,
993                           struct mdt_lock_handle *lhc)
994 {
995         const struct lu_env *env = info->mti_env;
996         struct mdt_device       *mdt  = info->mti_mdt;
997         struct req_capsule      *pill = info->mti_pill;
998         struct ptlrpc_request   *req  = mdt_info_req(info);
999         struct tg_export_data   *ted  = &req->rq_export->exp_target_data;
1000         struct lsd_client_data  *lcd  = ted->ted_lcd;
1001         struct md_attr          *ma   = &info->mti_attr;
1002         struct mdt_reint_record *rr   = &info->mti_rr;
1003         __u32                   flags = info->mti_spec.sp_cr_flags;
1004         struct ldlm_reply       *ldlm_rep;
1005         struct mdt_object       *parent;
1006         struct mdt_object       *child;
1007         struct mdt_body         *repbody;
1008         int                      rc;
1009         ENTRY;
1010
1011         LASSERT(pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1012         ldlm_rep = req_capsule_server_get(pill, &RMF_DLM_REP);
1013         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
1014
1015         ma->ma_lmm = req_capsule_server_get(pill, &RMF_MDT_MD);
1016         ma->ma_lmm_size = req_capsule_get_size(pill, &RMF_MDT_MD,
1017                                                RCL_SERVER);
1018         ma->ma_need = MA_INODE | MA_HSM;
1019         if (ma->ma_lmm_size > 0)
1020                 ma->ma_need |= MA_LOV;
1021
1022         ma->ma_valid = 0;
1023
1024         mdt_req_from_lcd(req, lcd);
1025         mdt_set_disposition(info, ldlm_rep, lcd->lcd_last_data);
1026
1027         CDEBUG(D_INODE, "This is reconstruct open: disp="LPX64", result=%d\n",
1028                ldlm_rep->lock_policy_res1, req->rq_status);
1029
1030         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE) &&
1031             req->rq_status != 0)
1032                 /* We did not create successfully, return error to client. */
1033                 GOTO(out, rc = req->rq_status);
1034
1035         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE)) {
1036                 struct obd_export *exp = req->rq_export;
1037                 /*
1038                  * We failed after creation, but we do not know in which step
1039                  * we failed. So try to check the child object.
1040                  */
1041                 parent = mdt_object_find(env, mdt, rr->rr_fid1);
1042                 if (IS_ERR(parent)) {
1043                         rc = PTR_ERR(parent);
1044                         LCONSOLE_WARN("Parent "DFID" lookup error %d."
1045                                       " Evicting client %s with export %s.\n",
1046                                       PFID(rr->rr_fid1), rc,
1047                                       obd_uuid2str(&exp->exp_client_uuid),
1048                                       obd_export_nid2str(exp));
1049                         mdt_export_evict(exp);
1050                         RETURN_EXIT;
1051                 }
1052                 child = mdt_object_find(env, mdt, rr->rr_fid2);
1053                 if (IS_ERR(child)) {
1054                         rc = PTR_ERR(child);
1055                         LCONSOLE_WARN("Child "DFID" lookup error %d."
1056                                       " Evicting client %s with export %s.\n",
1057                                       PFID(mdt_object_fid(child)), rc,
1058                                       obd_uuid2str(&exp->exp_client_uuid),
1059                                       obd_export_nid2str(exp));
1060                         mdt_object_put(env, parent);
1061                         mdt_export_evict(exp);
1062                         RETURN_EXIT;
1063                 }
1064
1065                 if (unlikely(mdt_object_remote(child))) {
1066                         /* the child object was created on remote server */
1067                         if (!mdt_is_dne_client(exp)) {
1068                                 /* Return -EIO for old client */
1069                                 mdt_object_put(env, parent);
1070                                 mdt_object_put(env, child);
1071                                 GOTO(out, rc = -EIO);
1072                         }
1073                         repbody->fid1 = *rr->rr_fid2;
1074                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1075                         rc = 0;
1076                 } else {
1077                         if (mdt_object_exists(child)) {
1078                                 mdt_set_capainfo(info, 1, rr->rr_fid2,
1079                                                  BYPASS_CAPA);
1080                                 rc = mdt_attr_get_complex(info, child, ma);
1081                                 if (rc == 0)
1082                                         rc = mdt_finish_open(info, parent,
1083                                                              child, flags,
1084                                                              1, ldlm_rep);
1085                         } else {
1086                                 /* the child does not exist, we should do
1087                                  * regular open */
1088                                 mdt_object_put(env, parent);
1089                                 mdt_object_put(env, child);
1090                                 GOTO(regular_open, 0);
1091                         }
1092                 }
1093                 mdt_object_put(env, parent);
1094                 mdt_object_put(env, child);
1095                 GOTO(out, rc);
1096         } else {
1097 regular_open:
1098                 /* We did not try to create, so we are a pure open */
1099                 rc = mdt_reint_open(info, lhc);
1100         }
1101
1102         EXIT;
1103 out:
1104         req->rq_status = rc;
1105         lustre_msg_set_status(req->rq_repmsg, req->rq_status);
1106         LASSERT(ergo(rc < 0, lustre_msg_get_transno(req->rq_repmsg) == 0));
1107 }
1108
1109 int mdt_open_by_fid(struct mdt_thread_info* info,
1110                     struct ldlm_reply *rep)
1111 {
1112         __u32                    flags = info->mti_spec.sp_cr_flags;
1113         struct mdt_reint_record *rr = &info->mti_rr;
1114         struct md_attr          *ma = &info->mti_attr;
1115         struct mdt_object       *o;
1116         int                      rc;
1117         ENTRY;
1118
1119         o = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid2);
1120         if (IS_ERR(o))
1121                 RETURN(rc = PTR_ERR(o));
1122
1123         if (unlikely(mdt_object_remote(o))) {
1124                 /* the child object was created on remote server */
1125                 struct mdt_body *repbody;
1126
1127                 mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1128                                                 DISP_LOOKUP_EXECD |
1129                                                 DISP_LOOKUP_POS));
1130                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1131                 repbody->fid1 = *rr->rr_fid2;
1132                 repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1133                 rc = 0;
1134         } else {
1135                 if (mdt_object_exists(o)) {
1136                         mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1137                                                         DISP_LOOKUP_EXECD |
1138                                                         DISP_LOOKUP_POS));
1139
1140                         rc = mdt_attr_get_complex(info, o, ma);
1141                         if (rc == 0)
1142                                 rc = mdt_finish_open(info, NULL, o, flags, 0,
1143                                                      rep);
1144                 } else {
1145                         rc = -ENOENT;
1146                 }
1147         }
1148
1149         mdt_object_put(info->mti_env, o);
1150         RETURN(rc);
1151 }
1152
1153 /* lock object for open */
1154 static int mdt_object_open_lock(struct mdt_thread_info *info,
1155                                 struct mdt_object *obj,
1156                                 struct mdt_lock_handle *lhc,
1157                                 __u64 *ibits)
1158 {
1159         struct md_attr  *ma = &info->mti_attr;
1160         __u64            open_flags = info->mti_spec.sp_cr_flags;
1161         ldlm_mode_t      lm = LCK_CR;
1162         bool             try_layout = false;
1163         bool             create_layout = false;
1164         int              rc = 0;
1165         ENTRY;
1166
1167         *ibits = 0;
1168         if (open_flags & MDS_OPEN_LOCK) {
1169                 if (open_flags & FMODE_WRITE)
1170                         lm = LCK_CW;
1171                 /* if file is released, we can't deny write because we must
1172                  * restore (write) it to access it. */
1173                 else if ((open_flags & MDS_FMODE_EXEC) &&
1174                          !((ma->ma_valid & MA_HSM) &&
1175                            (ma->ma_hsm.mh_flags & HS_RELEASED)))
1176                         lm = LCK_PR;
1177                 else
1178                         lm = LCK_CR;
1179
1180                 *ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_OPEN;
1181         }
1182
1183         if (S_ISREG(lu_object_attr(&obj->mot_obj))) {
1184                 if (ma->ma_need & MA_LOV && !(ma->ma_valid & MA_LOV) &&
1185                     md_should_create(open_flags))
1186                         create_layout = true;
1187                 if (exp_connect_layout(info->mti_exp) && !create_layout &&
1188                     ma->ma_need & MA_LOV)
1189                         try_layout = true;
1190         }
1191
1192         mdt_lock_handle_init(lhc);
1193         mdt_lock_reg_init(lhc, lm);
1194
1195         /* one problem to return layout lock on open is that it may result
1196          * in too many layout locks cached on the client side. */
1197         if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_OPEN) && try_layout) {
1198                 /* return lookup lock to validate inode at the client side,
1199                  * this is pretty important otherwise mdt will return layout
1200                  * lock for each open.
1201                  * However this is a double-edged sword because changing
1202                  * permission will revoke huge # of LOOKUP locks. */
1203                 *ibits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
1204                 if (!mdt_object_lock_try(info, obj, lhc, *ibits,
1205                                          MDT_CROSS_LOCK)) {
1206                         *ibits &= ~(MDS_INODELOCK_LAYOUT|MDS_INODELOCK_LOOKUP);
1207                         if (*ibits != 0)
1208                                 rc = mdt_object_lock(info, obj, lhc, *ibits,
1209                                                 MDT_CROSS_LOCK);
1210                 }
1211         } else if (*ibits != 0) {
1212                 rc = mdt_object_lock(info, obj, lhc, *ibits, MDT_CROSS_LOCK);
1213         }
1214
1215         CDEBUG(D_INODE, "Requested bits lock:"DFID ", ibits = "LPX64
1216                 ", open_flags = "LPO64", try_layout = %d, rc = %d\n",
1217                 PFID(mdt_object_fid(obj)), *ibits, open_flags, try_layout, rc);
1218
1219         /* will change layout, revoke layout locks by enqueuing EX lock. */
1220         if (rc == 0 && create_layout) {
1221                 struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LAYOUT];
1222
1223                 CDEBUG(D_INODE, "Will create layout, get EX layout lock:"DFID
1224                         ", open_flags = "LPO64"\n",
1225                         PFID(mdt_object_fid(obj)), open_flags);
1226
1227                 LASSERT(!try_layout);
1228                 mdt_lock_handle_init(ll);
1229                 mdt_lock_reg_init(ll, LCK_EX);
1230                 rc = mdt_object_lock(info, obj, ll, MDS_INODELOCK_LAYOUT,
1231                                         MDT_LOCAL_LOCK);
1232
1233                 OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LL_BLOCK, 2);
1234         }
1235
1236         RETURN(rc);
1237 }
1238
1239 static void mdt_object_open_unlock(struct mdt_thread_info *info,
1240                                    struct mdt_object *obj,
1241                                    struct mdt_lock_handle *lhc,
1242                                    __u64 ibits, int rc)
1243 {
1244         __u64 open_flags = info->mti_spec.sp_cr_flags;
1245         struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LAYOUT];
1246
1247         /* Release local layout lock - the layout lock put in MDT_LH_LAYOUT
1248          * will never return to client side. */
1249         if (lustre_handle_is_used(&ll->mlh_reg_lh)) {
1250                 LASSERT(!(ibits & MDS_INODELOCK_LAYOUT));
1251                 mdt_object_unlock(info, obj, ll, 1);
1252         }
1253
1254         /* Cross-ref case, the lock should be returned to the client */
1255         if (ibits == 0 || rc == -EREMOTE)
1256                 return;
1257
1258         if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
1259                 /* for the open request, the lock will only return to client
1260                  * if open or layout lock is granted. */
1261                 rc = 1;
1262         }
1263
1264         if (rc != 0) {
1265                 struct ldlm_reply       *ldlm_rep;
1266
1267                 ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1268                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1269                 mdt_object_unlock(info, obj, lhc, 1);
1270         }
1271 }
1272
1273 int mdt_open_by_fid_lock(struct mdt_thread_info *info, struct ldlm_reply *rep,
1274                          struct mdt_lock_handle *lhc)
1275 {
1276         const struct lu_env     *env   = info->mti_env;
1277         struct mdt_device       *mdt   = info->mti_mdt;
1278         __u32                    flags = info->mti_spec.sp_cr_flags;
1279         struct mdt_reint_record *rr    = &info->mti_rr;
1280         struct md_attr          *ma    = &info->mti_attr;
1281         struct mdt_object       *parent= NULL;
1282         struct mdt_object       *o;
1283         int                      rc;
1284         __u64                    ibits = 0;
1285         ENTRY;
1286
1287         if (md_should_create(flags) && !(flags & MDS_OPEN_HAS_EA)) {
1288                 if (!lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) {
1289                         parent = mdt_object_find(env, mdt, rr->rr_fid1);
1290                         if (IS_ERR(parent)) {
1291                                 CDEBUG(D_INODE, "Fail to find parent "DFID
1292                                        " for anonymous created %ld, try to"
1293                                        " use server-side parent.\n",
1294                                        PFID(rr->rr_fid1), PTR_ERR(parent));
1295                                 parent = NULL;
1296                         }
1297                 }
1298                 if (parent == NULL)
1299                         ma->ma_need |= MA_PFID;
1300         }
1301
1302         o = mdt_object_find(env, mdt, rr->rr_fid2);
1303         if (IS_ERR(o))
1304                 RETURN(rc = PTR_ERR(o));
1305
1306         if (mdt_object_remote(o)) {
1307                 CDEBUG(D_INFO, "%s: "DFID" is on remote MDT.\n",
1308                        mdt_obd_name(info->mti_mdt),
1309                        PFID(rr->rr_fid2));
1310                 GOTO(out, rc = -EREMOTE);
1311         } else if (!mdt_object_exists(o)) {
1312                 mdt_set_disposition(info, rep,
1313                                     DISP_IT_EXECD |
1314                                     DISP_LOOKUP_EXECD |
1315                                     DISP_LOOKUP_NEG);
1316                 GOTO(out, rc = -ENOENT);
1317         }
1318
1319         mdt_set_disposition(info, rep, (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1320
1321         rc = mdt_attr_get_complex(info, o, ma);
1322         if (rc)
1323                 GOTO(out, rc);
1324
1325         rc = mdt_object_open_lock(info, o, lhc, &ibits);
1326         if (rc)
1327                 GOTO(out, rc);
1328
1329         if (ma->ma_valid & MA_PFID) {
1330                 parent = mdt_object_find(env, mdt, &ma->ma_pfid);
1331                 if (IS_ERR(parent)) {
1332                         CDEBUG(D_INODE, "Fail to find parent "DFID
1333                                " for anonymous created %ld, try to"
1334                                " use system default.\n",
1335                                PFID(&ma->ma_pfid), PTR_ERR(parent));
1336                         parent = NULL;
1337                 }
1338         }
1339
1340         rc = mdt_finish_open(info, parent, o, flags, 0, rep);
1341         if (!rc) {
1342                 mdt_set_disposition(info, rep, DISP_LOOKUP_POS);
1343                 if (flags & MDS_OPEN_LOCK)
1344                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
1345         }
1346         GOTO(out, rc);
1347
1348 out:
1349         mdt_object_open_unlock(info, o, lhc, ibits, rc);
1350         mdt_object_put(env, o);
1351         if (parent != NULL)
1352                 mdt_object_put(env, parent);
1353         return rc;
1354 }
1355
1356 int mdt_pin(struct mdt_thread_info* info)
1357 {
1358         ENTRY;
1359         RETURN(err_serious(-EOPNOTSUPP));
1360 }
1361
1362 /* Cross-ref request. Currently it can only be a pure open (w/o create) */
1363 static int mdt_cross_open(struct mdt_thread_info *info,
1364                           const struct lu_fid *parent_fid,
1365                           const struct lu_fid *fid,
1366                           struct ldlm_reply *rep, __u32 flags)
1367 {
1368         struct md_attr    *ma = &info->mti_attr;
1369         struct mdt_object *o;
1370         int                rc;
1371         ENTRY;
1372
1373         o = mdt_object_find(info->mti_env, info->mti_mdt, fid);
1374         if (IS_ERR(o))
1375                 RETURN(rc = PTR_ERR(o));
1376
1377         if (mdt_object_remote(o)) {
1378                 /* Something is wrong here, the object is on another MDS! */
1379                 CERROR("%s: "DFID" isn't on this server!: rc = %d\n",
1380                        mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1381                 LU_OBJECT_DEBUG(D_WARNING, info->mti_env,
1382                                 &o->mot_obj,
1383                                 "Object isn't on this server! FLD error?\n");
1384                 rc = -EFAULT;
1385         } else {
1386                 if (mdt_object_exists(o)) {
1387                         /* Do permission check for cross-open. */
1388                         rc = mo_permission(info->mti_env, NULL,
1389                                            mdt_object_child(o),
1390                                            NULL, flags | MDS_OPEN_CROSS);
1391                         if (rc)
1392                                 goto out;
1393
1394                         mdt_set_capainfo(info, 0, fid, BYPASS_CAPA);
1395                         rc = mdt_attr_get_complex(info, o, ma);
1396                         if (rc != 0)
1397                                 GOTO(out, rc);
1398
1399                         /* Do not create lov object if the fid is opened
1400                          * under OBF */
1401                         if (S_ISREG(ma->ma_attr.la_mode) &&
1402                             !(ma->ma_valid & MA_LOV) && (flags & FMODE_WRITE) &&
1403                             fid_is_obf(parent_fid))
1404                                 GOTO(out, rc = -EPERM);
1405
1406                         rc = mdt_finish_open(info, NULL, o, flags, 0, rep);
1407                 } else {
1408                         /*
1409                          * Something is wrong here. lookup was positive but
1410                          * there is no object!
1411                          */
1412                         CERROR("%s: "DFID" doesn't exist!: rc = %d\n",
1413                               mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1414                         rc = -EFAULT;
1415                 }
1416         }
1417 out:
1418         mdt_object_put(info->mti_env, o);
1419         RETURN(rc);
1420 }
1421
1422 int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
1423 {
1424         struct mdt_device       *mdt = info->mti_mdt;
1425         struct ptlrpc_request   *req = mdt_info_req(info);
1426         struct mdt_object       *parent;
1427         struct mdt_object       *child;
1428         struct mdt_lock_handle  *lh;
1429         struct ldlm_reply       *ldlm_rep;
1430         struct mdt_body         *repbody;
1431         struct lu_fid           *child_fid = &info->mti_tmp_fid1;
1432         struct md_attr          *ma = &info->mti_attr;
1433         __u64                    create_flags = info->mti_spec.sp_cr_flags;
1434         __u64                    ibits;
1435         struct mdt_reint_record *rr = &info->mti_rr;
1436         struct lu_name          *lname;
1437         int                      result, rc;
1438         int                      created = 0;
1439         __u32                    msg_flags;
1440         ENTRY;
1441
1442         OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_MDS_PAUSE_OPEN, OBD_FAIL_ONCE,
1443                                (obd_timeout + 1) / 4);
1444
1445         mdt_counter_incr(req, LPROC_MDT_OPEN);
1446         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1447
1448         ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD);
1449         ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD,
1450                                                RCL_SERVER);
1451         ma->ma_need = MA_INODE;
1452         if (ma->ma_lmm_size > 0)
1453                 ma->ma_need |= MA_LOV;
1454
1455         ma->ma_valid = 0;
1456
1457         LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1458         ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1459
1460         if (unlikely(create_flags & MDS_OPEN_JOIN_FILE)) {
1461                 CERROR("file join is not supported anymore.\n");
1462                 GOTO(out, result = err_serious(-EOPNOTSUPP));
1463         }
1464         msg_flags = lustre_msg_get_flags(req->rq_reqmsg);
1465
1466         if ((create_flags & (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS)) &&
1467             info->mti_spec.u.sp_ea.eadata == NULL)
1468                 GOTO(out, result = err_serious(-EINVAL));
1469
1470         CDEBUG(D_INODE, "I am going to open "DFID"/(%s->"DFID") "
1471                "cr_flag="LPO64" mode=0%06o msg_flag=0x%x\n",
1472                PFID(rr->rr_fid1), rr->rr_name,
1473                PFID(rr->rr_fid2), create_flags,
1474                ma->ma_attr.la_mode, msg_flags);
1475         if (info->mti_cross_ref) {
1476                 /* This is cross-ref open */
1477                 mdt_set_disposition(info, ldlm_rep,
1478                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD |
1479                              DISP_LOOKUP_POS));
1480                 result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
1481                                         ldlm_rep, create_flags);
1482                 GOTO(out, result);
1483         } else if (req_is_replay(req) ||
1484             (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
1485                 /* This is a replay request or from liblustre with ea. */
1486                 result = mdt_open_by_fid(info, ldlm_rep);
1487
1488                 if (result != -ENOENT) {
1489                         if (req->rq_export->exp_libclient &&
1490                             create_flags & MDS_OPEN_HAS_EA)
1491                                 GOTO(out, result = 0);
1492                         GOTO(out, result);
1493                 }
1494                 /* We didn't find the correct object, so we need to re-create it
1495                  * via a regular replay. */
1496                 if (!(create_flags & MDS_OPEN_CREAT)) {
1497                         DEBUG_REQ(D_ERROR, req,
1498                                   "OPEN & CREAT not in open replay/by_fid.");
1499                         GOTO(out, result = -EFAULT);
1500                 }
1501                 CDEBUG(D_INFO, "No object(1), continue as regular open.\n");
1502         } else if ((rr->rr_namelen == 0 && create_flags & MDS_OPEN_LOCK) ||
1503                    (create_flags & MDS_OPEN_BY_FID)) {
1504                 result = mdt_open_by_fid_lock(info, ldlm_rep, lhc);
1505                 /* If result is 0 then open by FID has found the file
1506                  * and there is nothing left for us to do here.  More
1507                  * generally if it is anything other than -ENOENT or
1508                  * -EREMOTE then we return that now.  If -ENOENT and
1509                  * MDS_OPEN_CREAT is set then we must create the file
1510                  * below.  If -EREMOTE then we need to return a LOOKUP
1511                  * lock to the client, which we do below.  Hence this
1512                  * odd looking condition.  See LU-2523. */
1513                 if (!(result == -ENOENT && (create_flags & MDS_OPEN_CREAT)) &&
1514                     result != -EREMOTE)
1515                         GOTO(out, result);
1516
1517                 if (unlikely(rr->rr_namelen == 0))
1518                         GOTO(out, result = -EINVAL);
1519
1520                 CDEBUG(D_INFO, "No object(2), continue as regular open.\n");
1521         }
1522
1523         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK))
1524                 GOTO(out, result = err_serious(-ENOMEM));
1525
1526         mdt_set_disposition(info, ldlm_rep,
1527                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1528
1529         lh = &info->mti_lh[MDT_LH_PARENT];
1530         mdt_lock_pdo_init(lh, (create_flags & MDS_OPEN_CREAT) ?
1531                           LCK_PW : LCK_PR, rr->rr_name, rr->rr_namelen);
1532
1533         parent = mdt_object_find_lock(info, rr->rr_fid1, lh,
1534                                       MDS_INODELOCK_UPDATE);
1535         if (IS_ERR(parent))
1536                 GOTO(out, result = PTR_ERR(parent));
1537
1538         /* get and check version of parent */
1539         result = mdt_version_get_check(info, parent, 0);
1540         if (result)
1541                 GOTO(out_parent, result);
1542
1543         fid_zero(child_fid);
1544
1545         lname = mdt_name(info->mti_env, (char *)rr->rr_name, rr->rr_namelen);
1546         result = mdo_lookup(info->mti_env, mdt_object_child(parent),
1547                             lname, child_fid, &info->mti_spec);
1548         LASSERTF(ergo(result == 0, fid_is_sane(child_fid)),
1549                  "looking for "DFID"/%s, result fid="DFID"\n",
1550                  PFID(mdt_object_fid(parent)), rr->rr_name, PFID(child_fid));
1551
1552         if (result != 0 && result != -ENOENT && result != -ESTALE)
1553                 GOTO(out_parent, result);
1554
1555         if (result == -ENOENT || result == -ESTALE) {
1556                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
1557                 if (result == -ESTALE) {
1558                         /*
1559                          * -ESTALE means the parent is a dead(unlinked) dir, so
1560                          * it should return -ENOENT to in accordance with the
1561                          * original mds implementaion.
1562                          */
1563                         GOTO(out_parent, result = -ENOENT);
1564                 }
1565                 if (!(create_flags & MDS_OPEN_CREAT))
1566                         GOTO(out_parent, result);
1567                 *child_fid = *info->mti_rr.rr_fid2;
1568                 LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n",
1569                          PFID(child_fid));
1570                 /* In the function below, .hs_keycmp resolves to
1571                  * lu_obj_hop_keycmp() */
1572                 /* coverity[overrun-buffer-val] */
1573                 child = mdt_object_new(info->mti_env, mdt, child_fid);
1574         } else {
1575                 /*
1576                  * Check for O_EXCL is moved to the mdt_finish_open(), we need to
1577                  * return FID back in that case.
1578                  */
1579                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_POS);
1580                 child = mdt_object_find(info->mti_env, mdt, child_fid);
1581         }
1582         if (IS_ERR(child))
1583                 GOTO(out_parent, result = PTR_ERR(child));
1584
1585         /** check version of child  */
1586         rc = mdt_version_get_check(info, child, 1);
1587         if (rc)
1588                 GOTO(out_child, result = rc);
1589
1590         mdt_set_capainfo(info, 1, child_fid, BYPASS_CAPA);
1591         if (result == -ENOENT) {
1592                 /* Create under OBF and .lustre is not permitted */
1593                 if (fid_is_obf(rr->rr_fid1) || fid_is_dot_lustre(rr->rr_fid1))
1594                         GOTO(out_child, result = -EPERM);
1595
1596                 /* save versions in reply */
1597                 mdt_version_get_save(info, parent, 0);
1598                 mdt_version_get_save(info, child, 1);
1599
1600                 /* version of child will be changed */
1601                 info->mti_mos = child;
1602
1603                 /* Not found and with MDS_OPEN_CREAT: let's create it. */
1604                 mdt_set_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1605
1606                 /* Let lower layers know what is lock mode on directory. */
1607                 info->mti_spec.sp_cr_mode =
1608                         mdt_dlm_mode2mdl_mode(lh->mlh_pdo_mode);
1609
1610                 /*
1611                  * Do not perform lookup sanity check. We know that name does
1612                  * not exist.
1613                  */
1614                 info->mti_spec.sp_cr_lookup = 0;
1615                 info->mti_spec.sp_feat = &dt_directory_features;
1616
1617                 result = mdo_create(info->mti_env,
1618                                     mdt_object_child(parent),
1619                                     lname,
1620                                     mdt_object_child(child),
1621                                     &info->mti_spec,
1622                                     &info->mti_attr);
1623                 if (result == -ERESTART) {
1624                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1625                         GOTO(out_child, result);
1626                 } else {
1627
1628                         /* XXX: we should call this once, see few lines below */
1629                         if (result == 0)
1630                                 result = mdt_attr_get_complex(info, child, ma);
1631
1632                         if (result != 0)
1633                                 GOTO(out_child, result);
1634                 }
1635                 created = 1;
1636         } else {
1637                 /*
1638                  * The object is on remote node, return its FID for remote open.
1639                  */
1640                 if (mdt_object_remote(child)) {
1641                         /*
1642                          * Check if this lock already was sent to client and
1643                          * this is resent case. For resent case do not take lock
1644                          * again, use what is already granted.
1645                          */
1646                         LASSERT(lhc != NULL);
1647
1648                         if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
1649                                 struct ldlm_lock *lock;
1650
1651                                 LASSERT(msg_flags & MSG_RESENT);
1652
1653                                 lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
1654                                 if (!lock) {
1655                                         CERROR("Invalid lock handle "LPX64"\n",
1656                                                lhc->mlh_reg_lh.cookie);
1657                                         LBUG();
1658                                 }
1659                                 LASSERT(fid_res_name_eq(mdt_object_fid(child),
1660                                                         &lock->l_resource->lr_name));
1661                                 LDLM_LOCK_PUT(lock);
1662                                 rc = 0;
1663                         } else {
1664                                 mdt_lock_handle_init(lhc);
1665                                 mdt_lock_reg_init(lhc, LCK_PR);
1666
1667                                 rc = mdt_object_lock(info, child, lhc,
1668                                                      MDS_INODELOCK_LOOKUP,
1669                                                      MDT_CROSS_LOCK);
1670                         }
1671                         repbody->fid1 = *mdt_object_fid(child);
1672                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1673                         if (rc != 0)
1674                                 result = rc;
1675                         else
1676                                 result = -EREMOTE;
1677                         GOTO(out_child, result);
1678                 } else {
1679                         if (mdt_object_exists(child)) {
1680                                 /* We have to get attr & LOV EA & HSM for this
1681                                  * object */
1682                                 ma->ma_need |= MA_HSM;
1683                                 result = mdt_attr_get_complex(info, child, ma);
1684                         } else {
1685                                 /*object non-exist!!!*/
1686                                 LBUG();
1687                         }
1688                 }
1689         }
1690
1691         LASSERT(!lustre_handle_is_used(&lhc->mlh_reg_lh));
1692
1693         /* get openlock if this is not replay and if a client requested it */
1694         if (!req_is_replay(req)) {
1695                 rc = mdt_object_open_lock(info, child, lhc, &ibits);
1696                 if (rc != 0)
1697                         GOTO(out_child, result = rc);
1698                 else if (create_flags & MDS_OPEN_LOCK)
1699                         mdt_set_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1700         }
1701
1702         /* Try to open it now. */
1703         rc = mdt_finish_open(info, parent, child, create_flags,
1704                              created, ldlm_rep);
1705         if (rc) {
1706                 result = rc;
1707                 /* openlock will be released if mdt_finish_open failed */
1708                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1709                 if (created) {
1710                         ma->ma_need = 0;
1711                         ma->ma_valid = 0;
1712                         ma->ma_cookie_size = 0;
1713                         rc = mdo_unlink(info->mti_env,
1714                                         mdt_object_child(parent),
1715                                         mdt_object_child(child),
1716                                         lname,
1717                                         &info->mti_attr, 0);
1718                         if (rc != 0)
1719                                 CERROR("%s: "DFID" cleanup of open: rc = %d\n",
1720                                        mdt_obd_name(info->mti_mdt),
1721                                        PFID(mdt_object_fid(child)), rc);
1722                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1723                 }
1724         }
1725         EXIT;
1726 out_child:
1727         mdt_object_open_unlock(info, child, lhc, ibits, result);
1728         mdt_object_put(info->mti_env, child);
1729 out_parent:
1730         mdt_object_unlock_put(info, parent, lh, result || !created);
1731 out:
1732         if (result)
1733                 lustre_msg_set_transno(req->rq_repmsg, 0);
1734         return result;
1735 }
1736
/*
 * Evaluates to non-zero when \a mode, with the MDS_FMODE_EPOCH,
 * MDS_FMODE_SOM and MDS_FMODE_TRUNC bits masked out, is equal to
 * MDS_FMODE_CLOSED.
 */
#define MFD_CLOSED(mode)                                                    \
        (MDS_FMODE_CLOSED ==                                                \
         ((mode) & ~(MDS_FMODE_TRUNC | MDS_FMODE_SOM | MDS_FMODE_EPOCH)))
1739
1740 static int mdt_mfd_closed(struct mdt_file_data *mfd)
1741 {
1742         return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
1743 }
1744
1745 int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
1746 {
1747         struct mdt_object *o = mfd->mfd_object;
1748         struct md_object *next = mdt_object_child(o);
1749         struct md_attr *ma = &info->mti_attr;
1750         int ret = MDT_IOEPOCH_CLOSED;
1751         int rc = 0;
1752         int mode;
1753         ENTRY;
1754
1755         mode = mfd->mfd_mode;
1756
1757         if ((mode & FMODE_WRITE) || (mode & MDS_FMODE_TRUNC)) {
1758                 mdt_write_put(o);
1759                 ret = mdt_ioepoch_close(info, o);
1760         } else if (mode & MDS_FMODE_EXEC) {
1761                 mdt_write_allow(o);
1762         } else if (mode & MDS_FMODE_EPOCH) {
1763                 ret = mdt_ioepoch_close(info, o);
1764         } else if (mode & MDS_FMODE_SOM) {
1765                 ret = mdt_som_au_close(info, o);
1766         }
1767
1768         /* Update atime on close only. */
1769         if ((mode & MDS_FMODE_EXEC || mode & FMODE_READ || mode & FMODE_WRITE)
1770             && (ma->ma_valid & MA_INODE) && (ma->ma_attr.la_valid & LA_ATIME)) {
1771                 /* Set the atime only. */
1772                 ma->ma_valid = MA_INODE;
1773                 ma->ma_attr.la_valid = LA_ATIME;
1774                 rc = mo_attr_set(info->mti_env, next, ma);
1775         }
1776
1777         /* If file data is modified, add the dirty flag.
1778          *
1779          * If MDS_CLOSE_CLEANUP is set, this file is being closed due to an
1780          * eviction, file could have been modified and now dirty
1781          * regarding to HSM archive, check this!
1782          * The logic here is to mark a file dirty if there's a chance it was
1783          * dirtied before the client was evicted, so that we don't have to wait
1784          * for a release attempt before finding out the file was actually dirty
1785          * and fail the release. Aggressively marking it dirty here will cause
1786          * the policy engine to attempt to re-archive it; when rearchiving, we
1787          * can compare the current version to the LMA data_version and make the
1788          * archive request into a noop if it's not actually dirty.
1789          */
1790         if ((ma->ma_attr_flags & MDS_DATA_MODIFIED) ||
1791             ((ma->ma_attr_flags & MDS_CLOSE_CLEANUP) &&
1792              (mode & (FMODE_WRITE|MDS_FMODE_TRUNC))))
1793                 rc = mdt_add_dirty_flag(info, o, ma);
1794
1795         ma->ma_need |= MA_INODE;
1796         ma->ma_valid &= ~MA_INODE;
1797
1798         if (!MFD_CLOSED(mode))
1799                 rc = mo_close(info->mti_env, next, ma, mode);
1800
1801         if (ret == MDT_IOEPOCH_GETATTR || ret == MDT_IOEPOCH_OPENED) {
1802                 struct mdt_export_data *med;
1803
1804                 /* The IOepoch is still opened or SOM update is needed.
1805                  * Put mfd back into the list. */
1806                 LASSERT(mdt_conn_flags(info) & OBD_CONNECT_SOM);
1807                 mdt_mfd_set_mode(mfd, ret == MDT_IOEPOCH_OPENED ?
1808                                       MDS_FMODE_EPOCH : MDS_FMODE_SOM);
1809
1810                 LASSERT(mdt_info_req(info));
1811                 med = &mdt_info_req(info)->rq_export->exp_mdt_data;
1812                 spin_lock(&med->med_open_lock);
1813                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
1814                 class_handle_hash_back(&mfd->mfd_handle);
1815                 spin_unlock(&med->med_open_lock);
1816
1817                 if (ret == MDT_IOEPOCH_OPENED) {
1818                         ret = 0;
1819                 } else {
1820                         ret = -EAGAIN;
1821                         CDEBUG(D_INODE, "Size-on-MDS attribute update is "
1822                                "needed on "DFID"\n", PFID(mdt_object_fid(o)));
1823                 }
1824         } else {
1825                 mdt_mfd_free(mfd);
1826                 mdt_object_put(info->mti_env, o);
1827         }
1828
1829         RETURN(rc ? rc : ret);
1830 }
1831
1832 int mdt_close(struct mdt_thread_info *info)
1833 {
1834         struct mdt_export_data *med;
1835         struct mdt_file_data   *mfd;
1836         struct mdt_object      *o;
1837         struct md_attr         *ma = &info->mti_attr;
1838         struct mdt_body        *repbody = NULL;
1839         struct ptlrpc_request  *req = mdt_info_req(info);
1840         int rc, ret = 0;
1841         ENTRY;
1842
1843         mdt_counter_incr(req, LPROC_MDT_CLOSE);
1844         /* Close may come with the Size-on-MDS update. Unpack it. */
1845         rc = mdt_close_unpack(info);
1846         if (rc)
1847                 RETURN(err_serious(rc));
1848
1849         LASSERT(info->mti_ioepoch);
1850
1851         req_capsule_set_size(info->mti_pill, &RMF_MDT_MD, RCL_SERVER,
1852                              info->mti_mdt->mdt_max_mdsize);
1853         req_capsule_set_size(info->mti_pill, &RMF_LOGCOOKIES, RCL_SERVER,
1854                              info->mti_mdt->mdt_max_cookiesize);
1855         rc = req_capsule_server_pack(info->mti_pill);
1856         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
1857                 mdt_client_compatibility(info);
1858                 if (rc == 0)
1859                         mdt_fix_reply(info);
1860                 mdt_exit_ucred(info);
1861                 RETURN(lustre_msg_get_status(req->rq_repmsg));
1862         }
1863
1864         /* Continue to close handle even if we can not pack reply */
1865         if (rc == 0) {
1866                 repbody = req_capsule_server_get(info->mti_pill,
1867                                                  &RMF_MDT_BODY);
1868                 ma->ma_lmm = req_capsule_server_get(info->mti_pill,
1869                                                     &RMF_MDT_MD);
1870                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
1871                                                        &RMF_MDT_MD,
1872                                                        RCL_SERVER);
1873                 ma->ma_cookie = req_capsule_server_get(info->mti_pill,
1874                                                        &RMF_LOGCOOKIES);
1875                 ma->ma_cookie_size = req_capsule_get_size(info->mti_pill,
1876                                                           &RMF_LOGCOOKIES,
1877                                                           RCL_SERVER);
1878                 ma->ma_need = MA_INODE | MA_LOV | MA_COOKIE;
1879                 repbody->eadatasize = 0;
1880                 repbody->aclsize = 0;
1881         } else {
1882                 rc = err_serious(rc);
1883         }
1884
1885         med = &req->rq_export->exp_mdt_data;
1886         spin_lock(&med->med_open_lock);
1887         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
1888                              req_is_replay(req));
1889         if (mdt_mfd_closed(mfd)) {
1890                 spin_unlock(&med->med_open_lock);
1891                 CDEBUG(D_INODE, "no handle for file close: fid = "DFID
1892                        ": cookie = "LPX64"\n", PFID(info->mti_rr.rr_fid1),
1893                        info->mti_ioepoch->handle.cookie);
1894                 /** not serious error since bug 3633 */
1895                 rc = -ESTALE;
1896         } else {
1897                 class_handle_unhash(&mfd->mfd_handle);
1898                 cfs_list_del_init(&mfd->mfd_list);
1899                 spin_unlock(&med->med_open_lock);
1900
1901                 /* Do not lose object before last unlink. */
1902                 o = mfd->mfd_object;
1903                 mdt_object_get(info->mti_env, o);
1904                 ret = mdt_mfd_close(info, mfd);
1905                 if (repbody != NULL)
1906                         rc = mdt_handle_last_unlink(info, o, ma);
1907                 mdt_empty_transno(info, rc);
1908                 mdt_object_put(info->mti_env, o);
1909         }
1910         if (repbody != NULL) {
1911                 mdt_client_compatibility(info);
1912                 rc = mdt_fix_reply(info);
1913         }
1914
1915         mdt_exit_ucred(info);
1916         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK))
1917                 RETURN(err_serious(-ENOMEM));
1918
1919         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET_REP,
1920                                  OBD_FAIL_MDS_CLOSE_NET_REP))
1921                 info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET_REP;
1922         RETURN(rc ? rc : ret);
1923 }
1924
1925 /**
1926  * DONE_WRITING rpc handler.
1927  *
1928  * As mfd is not kept after replayed CLOSE (see mdt_ioepoch_close_on_replay()),
1929  * only those DONE_WRITING rpc will be replayed which really wrote smth on disk,
1930  * and got a trasid. Waiting for such DONE_WRITING is not reliable, so just
1931  * skip attributes and reconstruct the reply here.
1932  */
1933 int mdt_done_writing(struct mdt_thread_info *info)
1934 {
1935         struct ptlrpc_request   *req = mdt_info_req(info);
1936         struct mdt_body         *repbody = NULL;
1937         struct mdt_export_data  *med;
1938         struct mdt_file_data    *mfd;
1939         int rc;
1940         ENTRY;
1941
1942         rc = req_capsule_server_pack(info->mti_pill);
1943         if (rc)
1944                 RETURN(err_serious(rc));
1945
1946         repbody = req_capsule_server_get(info->mti_pill,
1947                                          &RMF_MDT_BODY);
1948         repbody->eadatasize = 0;
1949         repbody->aclsize = 0;
1950
1951         /* Done Writing may come with the Size-on-MDS update. Unpack it. */
1952         rc = mdt_close_unpack(info);
1953         if (rc)
1954                 RETURN(err_serious(rc));
1955
1956         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
1957                 mdt_exit_ucred(info);
1958                 RETURN(lustre_msg_get_status(req->rq_repmsg));
1959         }
1960
1961         med = &info->mti_exp->exp_mdt_data;
1962         spin_lock(&med->med_open_lock);
1963         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
1964                              req_is_replay(req));
1965         if (mfd == NULL) {
1966                 spin_unlock(&med->med_open_lock);
1967                 CDEBUG(D_INODE, "no handle for done write: fid = "DFID
1968                        ": cookie = "LPX64" ioepoch = "LPU64"\n",
1969                        PFID(info->mti_rr.rr_fid1),
1970                        info->mti_ioepoch->handle.cookie,
1971                        info->mti_ioepoch->ioepoch);
1972                 /* If this is a replay, reconstruct the transno. */
1973                 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
1974                         rc = info->mti_ioepoch->flags & MF_SOM_AU ?
1975                              -EAGAIN : 0;
1976                         mdt_empty_transno(info, rc);
1977                 } else
1978                         rc = -ESTALE;
1979                 GOTO(error_ucred, rc);
1980         }
1981
1982         LASSERT(mfd->mfd_mode == MDS_FMODE_EPOCH ||
1983                 mfd->mfd_mode == MDS_FMODE_TRUNC);
1984         class_handle_unhash(&mfd->mfd_handle);
1985         cfs_list_del_init(&mfd->mfd_list);
1986         spin_unlock(&med->med_open_lock);
1987
1988         /* Set EPOCH CLOSE flag if not set by client. */
1989         info->mti_ioepoch->flags |= MF_EPOCH_CLOSE;
1990         info->mti_attr.ma_valid = 0;
1991
1992         info->mti_attr.ma_lmm_size = info->mti_mdt->mdt_max_mdsize;
1993         OBD_ALLOC_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
1994         if (info->mti_attr.ma_lmm == NULL)
1995                 GOTO(error_ucred, rc = -ENOMEM);
1996
1997         rc = mdt_mfd_close(info, mfd);
1998
1999         OBD_FREE_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
2000         mdt_empty_transno(info, rc);
2001 error_ucred:
2002         mdt_exit_ucred(info);
2003         RETURN(rc);
2004 }