Whamcloud - gitweb
LU-2919 hsm: Implementation of exclusive open
[fs/lustre-release.git] / lustre / mdt / mdt_open.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2013, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/mdt/mdt_open.c
37  *
38  * Lustre Metadata Target (mdt) open/close file handling
39  *
40  * Author: Huang Hua <huanghua@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_MDS
44
45 #include <lustre_acl.h>
46 #include <lustre_mds.h>
47 #include "mdt_internal.h"
48
/*
 * "addref" callback installed in mfd_handle_ops.
 * mdt_file_data has no reference count at present, so taking a
 * reference is a no-op.
 */
static void mdt_mfd_get(void *mfdp)
{
        (void)mfdp;
}
53
54 static struct portals_handle_ops mfd_handle_ops = {
55         .hop_addref = mdt_mfd_get,
56         .hop_free   = NULL,
57 };
58
59 /* Create a new mdt_file_data struct, initialize it,
60  * and insert it to global hash table */
61 struct mdt_file_data *mdt_mfd_new(const struct mdt_export_data *med)
62 {
63         struct mdt_file_data *mfd;
64         ENTRY;
65
66         OBD_ALLOC_PTR(mfd);
67         if (mfd != NULL) {
68                 CFS_INIT_LIST_HEAD(&mfd->mfd_handle.h_link);
69                 mfd->mfd_handle.h_owner = med;
70                 CFS_INIT_LIST_HEAD(&mfd->mfd_list);
71                 class_handle_hash(&mfd->mfd_handle, &mfd_handle_ops);
72         }
73
74         RETURN(mfd);
75 }
76
77 /*
78  * Find the mfd pointed to by handle in global hash table.
79  * In case of replay the handle is obsoleted
80  * but mfd can be found in mfd list by that handle
81  */
82 struct mdt_file_data *mdt_handle2mfd(struct mdt_export_data *med,
83                                      const struct lustre_handle *handle,
84                                      bool is_replay_or_resent)
85 {
86         struct mdt_file_data   *mfd;
87         ENTRY;
88
89         LASSERT(handle != NULL);
90         mfd = class_handle2object(handle->cookie, med);
91         /* during dw/setattr replay the mfd can be found by old handle */
92         if (mfd == NULL && is_replay_or_resent) {
93                 cfs_list_for_each_entry(mfd, &med->med_open_head, mfd_list) {
94                         if (mfd->mfd_old_handle.cookie == handle->cookie)
95                                 RETURN(mfd);
96                 }
97                 mfd = NULL;
98         }
99
100         RETURN(mfd);
101 }
102
103 /* free mfd */
104 void mdt_mfd_free(struct mdt_file_data *mfd)
105 {
106         LASSERT(cfs_list_empty(&mfd->mfd_list));
107         OBD_FREE_RCU(mfd, sizeof *mfd, &mfd->mfd_handle);
108 }
109
110 static int mdt_create_data(struct mdt_thread_info *info,
111                            struct mdt_object *p, struct mdt_object *o)
112 {
113         struct md_op_spec     *spec = &info->mti_spec;
114         struct md_attr        *ma   = &info->mti_attr;
115         int                    rc   = 0;
116         ENTRY;
117
118         if (!md_should_create(spec->sp_cr_flags))
119                 RETURN(0);
120
121         ma->ma_need = MA_INODE | MA_LOV;
122         ma->ma_valid = 0;
123         mutex_lock(&o->mot_lov_mutex);
124         if (!(o->mot_flags & MOF_LOV_CREATED)) {
125                 if (p != NULL && (fid_is_obf(mdt_object_fid(p)) ||
126                                   fid_is_dot_lustre(mdt_object_fid(p))))
127                         GOTO(unlock, rc = -EPERM);
128
129                 rc = mdo_create_data(info->mti_env,
130                                      p ? mdt_object_child(p) : NULL,
131                                      mdt_object_child(o), spec, ma);
132                 if (rc == 0)
133                         rc = mdt_attr_get_complex(info, o, ma);
134
135                 if (rc == 0 && ma->ma_valid & MA_LOV)
136                         o->mot_flags |= MOF_LOV_CREATED;
137         }
138 unlock:
139         mutex_unlock(&o->mot_lov_mutex);
140         RETURN(rc);
141 }
142
143 static int mdt_ioepoch_opened(struct mdt_object *mo)
144 {
145         return mo->mot_ioepoch_count;
146 }
147
148 int mdt_object_is_som_enabled(struct mdt_object *mo)
149 {
150         return !mo->mot_ioepoch;
151 }
152
153 /**
154  * Re-enable Size-on-MDS.
155  * Call under ->mot_ioepoch_mutex.
156  */
157 static void mdt_object_som_enable(struct mdt_object *mo, __u64 ioepoch)
158 {
159         if (ioepoch == mo->mot_ioepoch) {
160                 LASSERT(!mdt_ioepoch_opened(mo));
161                 mo->mot_ioepoch = 0;
162                 mo->mot_flags = 0;
163         }
164 }
165
166 /**
167  * Open the IOEpoch. It is allowed if @writecount is not negative.
168  * The epoch and writecount handling is performed under the mot_ioepoch_mutex.
169  */
170 int mdt_ioepoch_open(struct mdt_thread_info *info, struct mdt_object *o,
171                      int created)
172 {
173         struct mdt_device *mdt = info->mti_mdt;
174         int cancel = 0;
175         int rc = 0;
176         ENTRY;
177
178         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
179             !S_ISREG(lu_object_attr(&o->mot_obj)))
180                 RETURN(0);
181
182         mutex_lock(&o->mot_ioepoch_mutex);
183         if (mdt_ioepoch_opened(o)) {
184                 /* Epoch continues even if there is no writers yet. */
185                 CDEBUG(D_INODE, "continue epoch "LPU64" for "DFID"\n",
186                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
187         } else {
188                 /* XXX: ->mdt_ioepoch is not initialized at the mount */
189                 spin_lock(&mdt->mdt_ioepoch_lock);
190                 if (mdt->mdt_ioepoch < info->mti_replayepoch)
191                         mdt->mdt_ioepoch = info->mti_replayepoch;
192
193                 if (info->mti_replayepoch)
194                         o->mot_ioepoch = info->mti_replayepoch;
195                 else if (++mdt->mdt_ioepoch == IOEPOCH_INVAL)
196                         o->mot_ioepoch = ++mdt->mdt_ioepoch;
197                 else
198                         o->mot_ioepoch = mdt->mdt_ioepoch;
199
200                 spin_unlock(&mdt->mdt_ioepoch_lock);
201
202                 CDEBUG(D_INODE, "starting epoch "LPU64" for "DFID"\n",
203                        o->mot_ioepoch, PFID(mdt_object_fid(o)));
204                 if (created)
205                         o->mot_flags |= MOF_SOM_CREATED;
206                 cancel = 1;
207         }
208         o->mot_ioepoch_count++;
209         mutex_unlock(&o->mot_ioepoch_mutex);
210
211         /* Cancel Size-on-MDS attributes cached on clients for the open case.
212          * In the truncate case, see mdt_reint_setattr(). */
213         if (cancel && (info->mti_rr.rr_fid1 != NULL)) {
214                 struct mdt_lock_handle  *lh = &info->mti_lh[MDT_LH_CHILD];
215                 mdt_lock_reg_init(lh, LCK_EX);
216                 rc = mdt_object_lock(info, o, lh, MDS_INODELOCK_UPDATE,
217                                      MDT_LOCAL_LOCK);
218                 if (rc == 0)
219                         mdt_object_unlock(info, o, lh, 1);
220         }
221         RETURN(rc);
222 }
223
224 /**
225  * Update SOM on-disk attributes.
226  * If enabling, write update inodes and lustre-ea with the proper IOEpoch,
227  * mountid and attributes. If disabling, clean SOM xattr.
228  * Call under ->mot_ioepoch_mutex.
229  */
230 static int mdt_som_attr_set(struct mdt_thread_info *info,
231                             struct mdt_object *obj, __u64 ioepoch, bool enable)
232 {
233         struct md_object        *next = mdt_object_child(obj);
234         int                      rc;
235         ENTRY;
236
237         CDEBUG(D_INODE, "Size-on-MDS attribute %s for epoch "LPU64
238                " on "DFID".\n", enable ? "update" : "disabling",
239                ioepoch, PFID(mdt_object_fid(obj)));
240
241         if (enable) {
242                 struct lu_buf           *buf = &info->mti_buf;
243                 struct som_attrs        *attrs;
244                 struct md_attr          *ma = &info->mti_attr;
245                 struct lu_attr          *la = &ma->ma_attr;
246                 struct obd_device       *obd = info->mti_mdt->mdt_lut.lut_obd;
247
248                 attrs = (struct som_attrs *)info->mti_xattr_buf;
249                 CLASSERT(sizeof(info->mti_xattr_buf) >= sizeof(*attrs));
250
251                 /* pack SOM attributes */
252                 memset(attrs, 0, sizeof(*attrs));
253                 attrs->som_ioepoch = ioepoch;
254                 attrs->som_mountid = obd->u.obt.obt_mount_count;
255                 if ((la->la_valid & LA_SIZE) != 0)
256                         attrs->som_size = la->la_size;
257                 if ((la->la_valid & LA_BLOCKS) != 0)
258                         attrs->som_blocks = la->la_blocks;
259                 lustre_som_swab(attrs);
260
261                 /* update SOM attributes */
262                 buf->lb_buf = attrs;
263                 buf->lb_len = sizeof(*attrs);
264                 rc = mo_xattr_set(info->mti_env, next, buf, XATTR_NAME_SOM, 0);
265         } else {
266                 /* delete SOM attributes */
267                 rc = mo_xattr_del(info->mti_env, next, XATTR_NAME_SOM);
268         }
269
270         RETURN(rc);
271 }
272
273 /** Perform the eviction specific actions on ioepoch close. */
274 static inline int mdt_ioepoch_close_on_eviction(struct mdt_thread_info *info,
275                                                 struct mdt_object *o)
276 {
277         int rc = 0;
278
279         mutex_lock(&o->mot_ioepoch_mutex);
280         CDEBUG(D_INODE, "Eviction. Closing IOepoch "LPU64" on "DFID". "
281                "Count %d\n", o->mot_ioepoch, PFID(mdt_object_fid(o)),
282                o->mot_ioepoch_count);
283         o->mot_ioepoch_count--;
284
285         /* If eviction occured set MOF_SOM_RECOV,
286          * if no other epoch holders, disable SOM on disk. */
287         o->mot_flags |= MOF_SOM_CHANGE | MOF_SOM_RECOV;
288         if (!mdt_ioepoch_opened(o)) {
289                 rc = mdt_som_attr_set(info, o, o->mot_ioepoch, MDT_SOM_DISABLE);
290                 mdt_object_som_enable(o, o->mot_ioepoch);
291         }
292         mutex_unlock(&o->mot_ioepoch_mutex);
293         RETURN(rc);
294 }
295
296 /**
297  * Perform the replay specific actions on ioepoch close.
298  * Skip SOM attribute update if obtained and just forget about the inode state
299  * for the last ioepoch holder. The SOM cache is invalidated on MDS failure.
300  */
301 static inline int mdt_ioepoch_close_on_replay(struct mdt_thread_info *info,
302                                               struct mdt_object *o)
303 {
304         int rc = MDT_IOEPOCH_CLOSED;
305         ENTRY;
306
307         mutex_lock(&o->mot_ioepoch_mutex);
308         CDEBUG(D_INODE, "Replay. Closing epoch "LPU64" on "DFID". Count %d\n",
309                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
310         o->mot_ioepoch_count--;
311
312         /* Get an info from the replayed request if client is supposed
313          * to send an Attibute Update, reconstruct @rc if so */
314         if (info->mti_ioepoch->flags & MF_SOM_AU)
315                 rc = MDT_IOEPOCH_GETATTR;
316
317         if (!mdt_ioepoch_opened(o))
318                 mdt_object_som_enable(o, info->mti_ioepoch->ioepoch);
319         mutex_unlock(&o->mot_ioepoch_mutex);
320
321         RETURN(rc);
322 }
323
324 /**
325  * Regular file IOepoch close.
326  * Closes the ioepoch, checks the object state, apply obtained attributes and
327  * re-enable SOM on the object, if possible. Also checks if the recovery is
328  * needed and packs OBD_MD_FLGETATTRLOCK flag into the reply to force the client
329  * to obtain SOM attributes under the server-side OST locks.
330  *
331  * Return value:
332  * MDT_IOEPOCH_CLOSED if ioepoch is closed.
333  * MDT_IOEPOCH_GETATTR if ioepoch is closed but another SOM update is needed.
334  */
335 static inline int mdt_ioepoch_close_reg(struct mdt_thread_info *info,
336                                         struct mdt_object *o)
337 {
338         struct md_attr *tmp_ma;
339         struct lu_attr *la;
340         int achange, opened;
341         int recovery = 0;
342         int rc = 0, ret = MDT_IOEPOCH_CLOSED;
343         ENTRY;
344
345         la = &info->mti_attr.ma_attr;
346         achange = (info->mti_ioepoch->flags & MF_SOM_CHANGE);
347
348         mutex_lock(&o->mot_ioepoch_mutex);
349         o->mot_ioepoch_count--;
350
351         tmp_ma = &info->mti_u.som.attr;
352         tmp_ma->ma_lmm = info->mti_attr.ma_lmm;
353         tmp_ma->ma_lmm_size = info->mti_attr.ma_lmm_size;
354         tmp_ma->ma_som = &info->mti_u.som.data;
355         tmp_ma->ma_need = MA_INODE | MA_LOV | MA_SOM;
356         tmp_ma->ma_valid = 0;
357         rc = mdt_attr_get_complex(info, o, tmp_ma);
358         if (rc)
359                 GOTO(error_up, rc);
360
361         /* Check the on-disk SOM state. */
362         if (o->mot_flags & MOF_SOM_RECOV)
363                 recovery = 1;
364         else if (!(o->mot_flags & MOF_SOM_CREATED) &&
365                  !(tmp_ma->ma_valid & MA_SOM))
366                 recovery = 1;
367
368         CDEBUG(D_INODE, "Closing epoch "LPU64" on "DFID". Count %d\n",
369                o->mot_ioepoch, PFID(mdt_object_fid(o)), o->mot_ioepoch_count);
370
371         opened = mdt_ioepoch_opened(o);
372         /**
373          * If IOEpoch is not opened, check if a Size-on-MDS update is needed.
374          * Skip the check for file with no LOV  or for unlink files.
375          */
376         if (!opened && tmp_ma->ma_valid & MA_LOV &&
377             !(tmp_ma->ma_valid & MA_INODE && tmp_ma->ma_attr.la_nlink == 0)) {
378                 if (recovery)
379                         /* If some previous writer was evicted, re-ask the
380                          * client for attributes. Even if attributes are
381                          * provided, we cannot believe in them.
382                          * Another use case is that there is no SOM cache on
383                          * disk -- first access with SOM or there was an MDS
384                          * failure. */
385                         ret = MDT_IOEPOCH_GETATTR;
386                 else if (o->mot_flags & MOF_SOM_CHANGE)
387                         /* Some previous writer changed the attribute.
388                          * Do not believe to the current Size-on-MDS
389                          * update, re-ask client. */
390                         ret = MDT_IOEPOCH_GETATTR;
391                 else if (!(la->la_valid & LA_SIZE) && achange)
392                         /* Attributes were changed by the last writer
393                          * only but no Size-on-MDS update is received.*/
394                         ret = MDT_IOEPOCH_GETATTR;
395         }
396
397         if (achange || ret == MDT_IOEPOCH_GETATTR)
398                 o->mot_flags |= MOF_SOM_CHANGE;
399
400         /* If epoch ends and relable SOM attributes are obtained, update them.
401          * Create SOM ea for new files even if there is no attributes obtained
402          * (0-length file). */
403         if (ret == MDT_IOEPOCH_CLOSED && !opened) {
404                 if (achange || o->mot_flags & MOF_SOM_CREATED) {
405                         LASSERT(achange || !(la->la_valid & LA_SIZE));
406                         rc = mdt_som_attr_set(info, o, o->mot_ioepoch,
407                                               MDT_SOM_ENABLE);
408                         /* Avoid the following setattrs of these attributes,
409                          * e.g. for atime update. */
410                         info->mti_attr.ma_valid = 0;
411                 }
412                 mdt_object_som_enable(o, o->mot_ioepoch);
413         }
414
415         mutex_unlock(&o->mot_ioepoch_mutex);
416         /* If recovery is needed, tell the client to perform GETATTR under
417          * the lock. */
418         if (ret == MDT_IOEPOCH_GETATTR && recovery) {
419                 struct mdt_body *rep;
420                 rep = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
421                 rep->valid |= OBD_MD_FLGETATTRLOCK;
422         }
423
424         RETURN(rc ? : ret);
425
426 error_up:
427         mutex_unlock(&o->mot_ioepoch_mutex);
428         return rc;
429 }
430
431 /**
432  * Close IOEpoch (opened file or MDS_FMODE_EPOCH state). It happens if:
433  * - a client closes the IOEpoch;
434  * - a client eviction occured.
435  * Return values:
436  * MDT_IOEPOCH_OPENED if the client does not close IOEpoch.
437  * MDT_IOEPOCH_CLOSED if the client closes IOEpoch.
438  * MDT_IOEPOCH_GETATTR if the client closes IOEpoch but another SOM attribute
439  * update is needed.
440  */
441 static int mdt_ioepoch_close(struct mdt_thread_info *info, struct mdt_object *o)
442 {
443         struct ptlrpc_request *req = mdt_info_req(info);
444         ENTRY;
445
446         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
447             !S_ISREG(lu_object_attr(&o->mot_obj)))
448                 RETURN(0);
449
450         LASSERT(o->mot_ioepoch_count);
451         LASSERT(info->mti_ioepoch == NULL ||
452                 info->mti_ioepoch->ioepoch == o->mot_ioepoch);
453
454         /* IOEpoch is closed only if client tells about it or eviction occures.
455          * In the replay case, always close the epoch. */
456         if (req == NULL)
457                 RETURN(mdt_ioepoch_close_on_eviction(info, o));
458         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
459                 RETURN(mdt_ioepoch_close_on_replay(info, o));
460         if (info->mti_ioepoch->flags & MF_EPOCH_CLOSE)
461                 RETURN(mdt_ioepoch_close_reg(info, o));
462         /* IO epoch is not closed. */
463         RETURN(MDT_IOEPOCH_OPENED);
464 }
465
466 /**
467  * Close MDS_FMODE_SOM state, when IOEpoch is already closed and we are waiting
468  * for attribute update. It happens if:
469  * - SOM Attribute Update is obtained;
470  * - the client failed to obtain it and informs MDS about it;
471  * - a client eviction occured.
472  * Apply obtained attributes for the 1st case, wipe out the on-disk SOM
473  * cache otherwise.
474  */
475 int mdt_som_au_close(struct mdt_thread_info *info, struct mdt_object *o)
476 {
477         struct ptlrpc_request   *req = mdt_info_req(info);
478         __u64                    ioepoch = 0;
479         int                      act = MDT_SOM_ENABLE;
480         int                      rc = 0;
481         ENTRY;
482
483         LASSERT(!req || info->mti_ioepoch);
484         if (!(mdt_conn_flags(info) & OBD_CONNECT_SOM) ||
485             !S_ISREG(lu_object_attr(&o->mot_obj)))
486                 RETURN(0);
487
488         /* No size whereas MF_SOM_CHANGE is set means client failed to
489          * obtain ost attributes, drop the SOM cache on disk if so. */
490         if (!req ||
491             (info->mti_ioepoch &&
492              info->mti_ioepoch->flags & MF_SOM_CHANGE &&
493              !(info->mti_attr.ma_attr.la_valid & LA_SIZE)))
494                 act = MDT_SOM_DISABLE;
495
496         mutex_lock(&o->mot_ioepoch_mutex);
497         /* Mark the object it is the recovery state if we failed to obtain
498          * SOM attributes. */
499         if (act == MDT_SOM_DISABLE)
500                 o->mot_flags |= MOF_SOM_RECOV;
501
502         if (!mdt_ioepoch_opened(o)) {
503                 ioepoch =  info->mti_ioepoch ?
504                         info->mti_ioepoch->ioepoch : o->mot_ioepoch;
505
506                 if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY))
507                         rc = mdt_som_attr_set(info, o, ioepoch, act);
508                 mdt_object_som_enable(o, ioepoch);
509         }
510         mutex_unlock(&o->mot_ioepoch_mutex);
511         RETURN(rc);
512 }
513
514 int mdt_write_read(struct mdt_object *o)
515 {
516         int rc = 0;
517         ENTRY;
518         mutex_lock(&o->mot_ioepoch_mutex);
519         rc = o->mot_writecount;
520         mutex_unlock(&o->mot_ioepoch_mutex);
521         RETURN(rc);
522 }
523
524 int mdt_write_get(struct mdt_object *o)
525 {
526         int rc = 0;
527         ENTRY;
528         mutex_lock(&o->mot_ioepoch_mutex);
529         if (o->mot_writecount < 0)
530                 rc = -ETXTBSY;
531         else
532                 o->mot_writecount++;
533         mutex_unlock(&o->mot_ioepoch_mutex);
534         RETURN(rc);
535 }
536
537 void mdt_write_put(struct mdt_object *o)
538 {
539         ENTRY;
540         mutex_lock(&o->mot_ioepoch_mutex);
541         o->mot_writecount--;
542         mutex_unlock(&o->mot_ioepoch_mutex);
543         EXIT;
544 }
545
546 static int mdt_write_deny(struct mdt_object *o)
547 {
548         int rc = 0;
549         ENTRY;
550         mutex_lock(&o->mot_ioepoch_mutex);
551         if (o->mot_writecount > 0)
552                 rc = -ETXTBSY;
553         else
554                 o->mot_writecount--;
555         mutex_unlock(&o->mot_ioepoch_mutex);
556         RETURN(rc);
557 }
558
559 static void mdt_write_allow(struct mdt_object *o)
560 {
561         ENTRY;
562         mutex_lock(&o->mot_ioepoch_mutex);
563         o->mot_writecount++;
564         mutex_unlock(&o->mot_ioepoch_mutex);
565         EXIT;
566 }
567
568 /* there can be no real transaction so prepare the fake one */
569 static void mdt_empty_transno(struct mdt_thread_info *info, int rc)
570 {
571         struct mdt_device      *mdt = info->mti_mdt;
572         struct ptlrpc_request  *req = mdt_info_req(info);
573         struct tg_export_data  *ted;
574         struct lsd_client_data *lcd;
575
576         ENTRY;
577         /* transaction has occurred already */
578         if (lustre_msg_get_transno(req->rq_repmsg) != 0)
579                 RETURN_EXIT;
580
581         spin_lock(&mdt->mdt_lut.lut_translock);
582         if (rc != 0) {
583                 if (info->mti_transno != 0) {
584                         struct obd_export *exp = req->rq_export;
585
586                         CERROR("%s: replay trans "LPU64" NID %s: rc = %d\n",
587                                mdt_obd_name(mdt), info->mti_transno,
588                                libcfs_nid2str(exp->exp_connection->c_peer.nid),
589                                rc);
590                         spin_unlock(&mdt->mdt_lut.lut_translock);
591                         RETURN_EXIT;
592                 }
593         } else if (info->mti_transno == 0) {
594                 info->mti_transno = ++mdt->mdt_lut.lut_last_transno;
595         } else {
596                 /* should be replay */
597                 if (info->mti_transno > mdt->mdt_lut.lut_last_transno)
598                         mdt->mdt_lut.lut_last_transno = info->mti_transno;
599         }
600         spin_unlock(&mdt->mdt_lut.lut_translock);
601
602         CDEBUG(D_INODE, "transno = "LPU64", last_committed = "LPU64"\n",
603                info->mti_transno,
604                req->rq_export->exp_obd->obd_last_committed);
605
606         req->rq_transno = info->mti_transno;
607         lustre_msg_set_transno(req->rq_repmsg, info->mti_transno);
608
609         /* update lcd in memory only for resent cases */
610         ted = &req->rq_export->exp_target_data;
611         LASSERT(ted);
612         mutex_lock(&ted->ted_lcd_lock);
613         lcd = ted->ted_lcd;
614         if (info->mti_transno < lcd->lcd_last_transno &&
615             info->mti_transno != 0) {
616                 /* This should happen during replay. Do not update
617                  * last rcvd info if replay req transno < last transno,
618                  * otherwise the following resend(after replay) can not
619                  * be checked correctly by xid */
620                 mutex_unlock(&ted->ted_lcd_lock);
621                 CDEBUG(D_HA, "%s: transno = "LPU64" < last_transno = "LPU64"\n",
622                        mdt_obd_name(mdt), info->mti_transno,
623                        lcd->lcd_last_transno);
624                 RETURN_EXIT;
625         }
626
627         if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE ||
628             lustre_msg_get_opc(req->rq_reqmsg) == MDS_DONE_WRITING) {
629                 if (info->mti_transno != 0)
630                         lcd->lcd_last_close_transno = info->mti_transno;
631                 lcd->lcd_last_close_xid = req->rq_xid;
632                 lcd->lcd_last_close_result = rc;
633         } else {
634                 /* VBR: save versions in last_rcvd for reconstruct. */
635                 __u64 *pre_versions = lustre_msg_get_versions(req->rq_repmsg);
636                 if (pre_versions) {
637                         lcd->lcd_pre_versions[0] = pre_versions[0];
638                         lcd->lcd_pre_versions[1] = pre_versions[1];
639                         lcd->lcd_pre_versions[2] = pre_versions[2];
640                         lcd->lcd_pre_versions[3] = pre_versions[3];
641                 }
642                 if (info->mti_transno != 0)
643                         lcd->lcd_last_transno = info->mti_transno;
644
645                 lcd->lcd_last_xid = req->rq_xid;
646                 lcd->lcd_last_result = rc;
647                 lcd->lcd_last_data = info->mti_opdata;
648         }
649         mutex_unlock(&ted->ted_lcd_lock);
650
651         EXIT;
652 }
653
654 void mdt_mfd_set_mode(struct mdt_file_data *mfd, __u64 mode)
655 {
656         LASSERT(mfd != NULL);
657
658         CDEBUG(D_HA, DFID "Change mfd mode 0x%Lx->0x%Lx\n",
659                PFID(mdt_object_fid(mfd->mfd_object)), mfd->mfd_mode, mode);
660
661         mfd->mfd_mode = mode;
662 }
663
664 static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
665                         struct mdt_object *o, __u64 flags, int created)
666 {
667         struct ptlrpc_request   *req = mdt_info_req(info);
668         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
669         struct mdt_file_data    *mfd;
670         struct md_attr          *ma  = &info->mti_attr;
671         struct lu_attr          *la  = &ma->ma_attr;
672         struct mdt_body         *repbody;
673         int                      rc = 0, isdir, isreg;
674         ENTRY;
675
676         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
677
678         isreg = S_ISREG(la->la_mode);
679         isdir = S_ISDIR(la->la_mode);
680         if (isreg && !(ma->ma_valid & MA_LOV)) {
681                 /*
682                  * No EA, check whether it is will set regEA and dirEA since in
683                  * above attr get, these size might be zero, so reset it, to
684                  * retrieve the MD after create obj.
685                  */
686                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
687                                                        &RMF_MDT_MD,
688                                                        RCL_SERVER);
689                 /* in replay case, p == NULL */
690                 rc = mdt_create_data(info, p, o);
691                 if (rc)
692                         RETURN(rc);
693         }
694
695         CDEBUG(D_INODE, "after open, ma_valid bit = "LPX64" lmm_size = %d\n",
696                ma->ma_valid, ma->ma_lmm_size);
697
698         if (ma->ma_valid & MA_LOV) {
699                 LASSERT(ma->ma_lmm_size != 0);
700                 repbody->eadatasize = ma->ma_lmm_size;
701                 if (isdir)
702                         repbody->valid |= OBD_MD_FLDIREA;
703                 else
704                         repbody->valid |= OBD_MD_FLEASIZE;
705         }
706
707         if (flags & FMODE_WRITE) {
708                 rc = mdt_write_get(o);
709                 if (rc == 0) {
710                         mdt_ioepoch_open(info, o, created);
711                         repbody->ioepoch = o->mot_ioepoch;
712                 }
713         } else if (flags & MDS_FMODE_EXEC) {
714                 /* if file is released, we can't deny write because we must
715                  * restore (write) it to access it.*/
716                 if ((ma->ma_valid & MA_HSM) &&
717                     (ma->ma_hsm.mh_flags & HS_RELEASED))
718                         rc = 0;
719                 else
720                         rc = mdt_write_deny(o);
721         }
722         if (rc)
723                 RETURN(rc);
724
725         rc = mo_open(info->mti_env, mdt_object_child(o),
726                      created ? flags | MDS_OPEN_CREATED : flags);
727         if (rc)
728                 GOTO(err_out, rc);
729
730         mfd = mdt_mfd_new(med);
731         if (mfd == NULL)
732                 GOTO(err_out, rc = -ENOMEM);
733
734         /*
735          * Keep a reference on this object for this open, and is
736          * released by mdt_mfd_close().
737          */
738         mdt_object_get(info->mti_env, o);
739         mfd->mfd_object = o;
740         mfd->mfd_xid = req->rq_xid;
741
742         /*
743          * @flags is always not zero. At least it should be FMODE_READ,
744          * FMODE_WRITE or MDS_FMODE_EXEC.
745          */
746         LASSERT(flags != 0);
747
748         /* Open handling. */
749         mdt_mfd_set_mode(mfd, flags);
750
751         atomic_inc(&o->mot_open_count);
752         if (flags & MDS_OPEN_LEASE)
753                 atomic_inc(&o->mot_lease_count);
754
755         /* replay handle */
756         if (req_is_replay(req)) {
757                 struct mdt_file_data *old_mfd;
758                 /* Check wheather old cookie already exist in
759                  * the list, becasue when do recovery, client
760                  * might be disconnected from server, and
761                  * restart replay, so there maybe some orphan
762                  * mfd here, we should remove them */
763                 LASSERT(info->mti_rr.rr_handle != NULL);
764                 old_mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle, true);
765                 if (old_mfd != NULL) {
766                         CDEBUG(D_HA, "delete orphan mfd = %p, fid = "DFID", "
767                                "cookie = "LPX64"\n", mfd,
768                                PFID(mdt_object_fid(mfd->mfd_object)),
769                                info->mti_rr.rr_handle->cookie);
770                         spin_lock(&med->med_open_lock);
771                         class_handle_unhash(&old_mfd->mfd_handle);
772                         cfs_list_del_init(&old_mfd->mfd_list);
773                         spin_unlock(&med->med_open_lock);
774                         /* no attr update for that close */
775                         la->la_valid = 0;
776                         ma->ma_valid |= MA_FLAGS;
777                         ma->ma_attr_flags |= MDS_RECOV_OPEN;
778                         mdt_mfd_close(info, old_mfd);
779                         ma->ma_attr_flags &= ~MDS_RECOV_OPEN;
780                         ma->ma_valid &= ~MA_FLAGS;
781                 }
782
783                 CDEBUG(D_HA, "Store old cookie "LPX64" in new mfd\n",
784                        info->mti_rr.rr_handle->cookie);
785
786                 mfd->mfd_old_handle.cookie = info->mti_rr.rr_handle->cookie;
787         }
788
789         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
790
791         if (req->rq_export->exp_disconnected) {
792                 spin_lock(&med->med_open_lock);
793                 class_handle_unhash(&mfd->mfd_handle);
794                 cfs_list_del_init(&mfd->mfd_list);
795                 spin_unlock(&med->med_open_lock);
796                 mdt_mfd_close(info, mfd);
797         } else {
798                 spin_lock(&med->med_open_lock);
799                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
800                 spin_unlock(&med->med_open_lock);
801         }
802
803         mdt_empty_transno(info, rc);
804
805         RETURN(rc);
806
807 err_out:
808         if (flags & FMODE_WRITE)
809                         /* XXX We also need to close io epoch here.
810                          * See LU-1220 - green */
811                 mdt_write_put(o);
812         else if (flags & FMODE_EXEC)
813                 mdt_write_allow(o);
814         return rc;
815 }
816
817 int mdt_finish_open(struct mdt_thread_info *info,
818                     struct mdt_object *p, struct mdt_object *o,
819                     __u64 flags, int created, struct ldlm_reply *rep)
820 {
821         struct ptlrpc_request   *req = mdt_info_req(info);
822         struct obd_export       *exp = req->rq_export;
823         struct mdt_export_data  *med = &req->rq_export->exp_mdt_data;
824         struct md_attr          *ma  = &info->mti_attr;
825         struct lu_attr          *la  = &ma->ma_attr;
826         struct mdt_file_data    *mfd;
827         struct mdt_body         *repbody;
828         int                      rc = 0;
829         int                      isreg, isdir, islnk;
830         cfs_list_t              *t;
831         ENTRY;
832
833         LASSERT(ma->ma_valid & MA_INODE);
834
835         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
836
837         isreg = S_ISREG(la->la_mode);
838         isdir = S_ISDIR(la->la_mode);
839         islnk = S_ISLNK(la->la_mode);
840         mdt_pack_attr2body(info, repbody, la, mdt_object_fid(o));
841
842         /* LU-2275, simulate broken behaviour (esp. prevalent in
843          * pre-2.4 servers where a very strange reply is sent on error
844          * that looks like it was actually almost succesful and a failure at the
845          * same time */
846         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_NEGATIVE_POSITIVE)) {
847                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN |
848                                                DISP_LOOKUP_NEG |
849                                                DISP_LOOKUP_POS);
850
851                 if (flags & MDS_OPEN_LOCK)
852                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
853
854                 RETURN(-ENOENT);
855         }
856
857         if (exp_connect_rmtclient(exp)) {
858                 void *buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
859
860                 rc = mdt_pack_remote_perm(info, o, buf);
861                 if (rc) {
862                         repbody->valid &= ~OBD_MD_FLRMTPERM;
863                         repbody->aclsize = 0;
864                 } else {
865                         repbody->valid |= OBD_MD_FLRMTPERM;
866                         repbody->aclsize = sizeof(struct mdt_remote_perm);
867                 }
868         }
869 #ifdef CONFIG_FS_POSIX_ACL
870         else if (exp_connect_flags(exp) & OBD_CONNECT_ACL) {
871                 const struct lu_env *env = info->mti_env;
872                 struct md_object *next = mdt_object_child(o);
873                 struct lu_buf *buf = &info->mti_buf;
874
875                 buf->lb_buf = req_capsule_server_get(info->mti_pill, &RMF_ACL);
876                 buf->lb_len = req_capsule_get_size(info->mti_pill, &RMF_ACL,
877                                                    RCL_SERVER);
878                 if (buf->lb_len > 0) {
879                         rc = mo_xattr_get(env, next, buf,
880                                           XATTR_NAME_ACL_ACCESS);
881                         if (rc < 0) {
882                                 if (rc == -ENODATA) {
883                                         repbody->aclsize = 0;
884                                         repbody->valid |= OBD_MD_FLACL;
885                                         rc = 0;
886                                 } else if (rc == -EOPNOTSUPP) {
887                                         rc = 0;
888                                 } else {
889                                         CERROR("got acl size: %d\n", rc);
890                                 }
891                         } else {
892                                 repbody->aclsize = rc;
893                                 repbody->valid |= OBD_MD_FLACL;
894                                 rc = 0;
895                         }
896                 }
897         }
898 #endif
899
900         if (info->mti_mdt->mdt_opts.mo_mds_capa &&
901             exp_connect_flags(exp) & OBD_CONNECT_MDS_CAPA) {
902                 struct lustre_capa *capa;
903
904                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1);
905                 LASSERT(capa);
906                 capa->lc_opc = CAPA_OPC_MDS_DEFAULT;
907                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
908                 if (rc)
909                         RETURN(rc);
910                 repbody->valid |= OBD_MD_FLMDSCAPA;
911         }
912
913         if (info->mti_mdt->mdt_opts.mo_oss_capa &&
914             exp_connect_flags(exp) & OBD_CONNECT_OSS_CAPA &&
915             S_ISREG(lu_object_attr(&o->mot_obj))) {
916                 struct lustre_capa *capa;
917
918                 capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA2);
919                 LASSERT(capa);
920                 capa->lc_opc = CAPA_OPC_OSS_DEFAULT | capa_open_opc(flags);
921                 rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0);
922                 if (rc)
923                         RETURN(rc);
924                 repbody->valid |= OBD_MD_FLOSSCAPA;
925         }
926
927         /*
928          * If we are following a symlink, don't open; and do not return open
929          * handle for special nodes as client required.
930          */
931         if (islnk || (!isreg && !isdir &&
932             (exp_connect_flags(req->rq_export) & OBD_CONNECT_NODEVOH))) {
933                 lustre_msg_set_transno(req->rq_repmsg, 0);
934                 RETURN(0);
935         }
936
937         /*
938          * We need to return the existing object's fid back, so it is done here,
939          * after preparing the reply.
940          */
941         if (!created && (flags & MDS_OPEN_EXCL) && (flags & MDS_OPEN_CREAT))
942                 RETURN(-EEXIST);
943
944         /* This can't be done earlier, we need to return reply body */
945         if (isdir) {
946                 if (flags & (MDS_OPEN_CREAT | FMODE_WRITE)) {
947                         /* We are trying to create or write an existing dir. */
948                         RETURN(-EISDIR);
949                 }
950         } else if (flags & MDS_OPEN_DIRECTORY)
951                 RETURN(-ENOTDIR);
952
953         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_OPEN_CREATE,
954                                  OBD_FAIL_LDLM_REPLY | OBD_FAIL_ONCE)) {
955                 RETURN(-EAGAIN);
956         }
957
958         mfd = NULL;
959         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
960                 spin_lock(&med->med_open_lock);
961                 cfs_list_for_each(t, &med->med_open_head) {
962                         mfd = cfs_list_entry(t, struct mdt_file_data, mfd_list);
963                         if (mfd->mfd_xid == req->rq_xid)
964                                 break;
965                         mfd = NULL;
966                 }
967                 spin_unlock(&med->med_open_lock);
968
969                 if (mfd != NULL) {
970                         repbody->handle.cookie = mfd->mfd_handle.h_cookie;
971                         /*set repbody->ea_size for resent case*/
972                         if (ma->ma_valid & MA_LOV) {
973                                 LASSERT(ma->ma_lmm_size != 0);
974                                 repbody->eadatasize = ma->ma_lmm_size;
975                                 if (isdir)
976                                         repbody->valid |= OBD_MD_FLDIREA;
977                                 else
978                                         repbody->valid |= OBD_MD_FLEASIZE;
979                         }
980                         mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
981                         RETURN(0);
982                 }
983         }
984
985         rc = mdt_mfd_open(info, p, o, flags, created);
986         if (!rc)
987                 mdt_set_disposition(info, rep, DISP_OPEN_OPEN);
988
989         RETURN(rc);
990 }
991
/* Prototype only; used by mdt_reconstruct_open() below. */
void mdt_req_from_lcd(struct ptlrpc_request *req, struct lsd_client_data *lcd);
994
995 void mdt_reconstruct_open(struct mdt_thread_info *info,
996                           struct mdt_lock_handle *lhc)
997 {
998         const struct lu_env *env = info->mti_env;
999         struct mdt_device       *mdt  = info->mti_mdt;
1000         struct req_capsule      *pill = info->mti_pill;
1001         struct ptlrpc_request   *req  = mdt_info_req(info);
1002         struct tg_export_data   *ted  = &req->rq_export->exp_target_data;
1003         struct lsd_client_data  *lcd  = ted->ted_lcd;
1004         struct md_attr          *ma   = &info->mti_attr;
1005         struct mdt_reint_record *rr   = &info->mti_rr;
1006         __u32                   flags = info->mti_spec.sp_cr_flags;
1007         struct ldlm_reply       *ldlm_rep;
1008         struct mdt_object       *parent;
1009         struct mdt_object       *child;
1010         struct mdt_body         *repbody;
1011         int                      rc;
1012         ENTRY;
1013
1014         LASSERT(pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1015         ldlm_rep = req_capsule_server_get(pill, &RMF_DLM_REP);
1016         repbody = req_capsule_server_get(pill, &RMF_MDT_BODY);
1017
1018         ma->ma_lmm = req_capsule_server_get(pill, &RMF_MDT_MD);
1019         ma->ma_lmm_size = req_capsule_get_size(pill, &RMF_MDT_MD,
1020                                                RCL_SERVER);
1021         ma->ma_need = MA_INODE | MA_HSM;
1022         if (ma->ma_lmm_size > 0)
1023                 ma->ma_need |= MA_LOV;
1024
1025         ma->ma_valid = 0;
1026
1027         mdt_req_from_lcd(req, lcd);
1028         mdt_set_disposition(info, ldlm_rep, lcd->lcd_last_data);
1029
1030         CDEBUG(D_INODE, "This is reconstruct open: disp="LPX64", result=%d\n",
1031                ldlm_rep->lock_policy_res1, req->rq_status);
1032
1033         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE) &&
1034             req->rq_status != 0)
1035                 /* We did not create successfully, return error to client. */
1036                 GOTO(out, rc = req->rq_status);
1037
1038         if (mdt_get_disposition(ldlm_rep, DISP_OPEN_CREATE)) {
1039                 struct obd_export *exp = req->rq_export;
1040                 /*
1041                  * We failed after creation, but we do not know in which step
1042                  * we failed. So try to check the child object.
1043                  */
1044                 parent = mdt_object_find(env, mdt, rr->rr_fid1);
1045                 if (IS_ERR(parent)) {
1046                         rc = PTR_ERR(parent);
1047                         LCONSOLE_WARN("Parent "DFID" lookup error %d."
1048                                       " Evicting client %s with export %s.\n",
1049                                       PFID(rr->rr_fid1), rc,
1050                                       obd_uuid2str(&exp->exp_client_uuid),
1051                                       obd_export_nid2str(exp));
1052                         mdt_export_evict(exp);
1053                         RETURN_EXIT;
1054                 }
1055                 child = mdt_object_find(env, mdt, rr->rr_fid2);
1056                 if (IS_ERR(child)) {
1057                         rc = PTR_ERR(child);
1058                         LCONSOLE_WARN("Child "DFID" lookup error %d."
1059                                       " Evicting client %s with export %s.\n",
1060                                       PFID(mdt_object_fid(child)), rc,
1061                                       obd_uuid2str(&exp->exp_client_uuid),
1062                                       obd_export_nid2str(exp));
1063                         mdt_object_put(env, parent);
1064                         mdt_export_evict(exp);
1065                         RETURN_EXIT;
1066                 }
1067
1068                 if (unlikely(mdt_object_remote(child))) {
1069                         /* the child object was created on remote server */
1070                         if (!mdt_is_dne_client(exp)) {
1071                                 /* Return -EIO for old client */
1072                                 mdt_object_put(env, parent);
1073                                 mdt_object_put(env, child);
1074                                 GOTO(out, rc = -EIO);
1075                         }
1076                         repbody->fid1 = *rr->rr_fid2;
1077                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1078                         rc = 0;
1079                 } else {
1080                         if (mdt_object_exists(child)) {
1081                                 mdt_set_capainfo(info, 1, rr->rr_fid2,
1082                                                  BYPASS_CAPA);
1083                                 rc = mdt_attr_get_complex(info, child, ma);
1084                                 if (rc == 0)
1085                                         rc = mdt_finish_open(info, parent,
1086                                                              child, flags,
1087                                                              1, ldlm_rep);
1088                         } else {
1089                                 /* the child does not exist, we should do
1090                                  * regular open */
1091                                 mdt_object_put(env, parent);
1092                                 mdt_object_put(env, child);
1093                                 GOTO(regular_open, 0);
1094                         }
1095                 }
1096                 mdt_object_put(env, parent);
1097                 mdt_object_put(env, child);
1098                 GOTO(out, rc);
1099         } else {
1100 regular_open:
1101                 /* We did not try to create, so we are a pure open */
1102                 rc = mdt_reint_open(info, lhc);
1103         }
1104
1105         EXIT;
1106 out:
1107         req->rq_status = rc;
1108         lustre_msg_set_status(req->rq_repmsg, req->rq_status);
1109         LASSERT(ergo(rc < 0, lustre_msg_get_transno(req->rq_repmsg) == 0));
1110 }
1111
1112 int mdt_open_by_fid(struct mdt_thread_info* info,
1113                     struct ldlm_reply *rep)
1114 {
1115         __u32                    flags = info->mti_spec.sp_cr_flags;
1116         struct mdt_reint_record *rr = &info->mti_rr;
1117         struct md_attr          *ma = &info->mti_attr;
1118         struct mdt_object       *o;
1119         int                      rc;
1120         ENTRY;
1121
1122         o = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid2);
1123         if (IS_ERR(o))
1124                 RETURN(rc = PTR_ERR(o));
1125
1126         if (unlikely(mdt_object_remote(o))) {
1127                 /* the child object was created on remote server */
1128                 struct mdt_body *repbody;
1129
1130                 mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1131                                                 DISP_LOOKUP_EXECD |
1132                                                 DISP_LOOKUP_POS));
1133                 repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1134                 repbody->fid1 = *rr->rr_fid2;
1135                 repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1136                 rc = 0;
1137         } else {
1138                 if (mdt_object_exists(o)) {
1139                         mdt_set_disposition(info, rep, (DISP_IT_EXECD |
1140                                                         DISP_LOOKUP_EXECD |
1141                                                         DISP_LOOKUP_POS));
1142
1143                         rc = mdt_attr_get_complex(info, o, ma);
1144                         if (rc == 0)
1145                                 rc = mdt_finish_open(info, NULL, o, flags, 0,
1146                                                      rep);
1147                 } else {
1148                         rc = -ENOENT;
1149                 }
1150         }
1151
1152         mdt_object_put(info->mti_env, o);
1153         RETURN(rc);
1154 }
1155
1156 /* lock object for open */
1157 static int mdt_object_open_lock(struct mdt_thread_info *info,
1158                                 struct mdt_object *obj,
1159                                 struct mdt_lock_handle *lhc,
1160                                 __u64 *ibits)
1161 {
1162         struct md_attr  *ma = &info->mti_attr;
1163         __u64            open_flags = info->mti_spec.sp_cr_flags;
1164         ldlm_mode_t      lm = LCK_CR;
1165         bool             acq_lease = !!(open_flags & MDS_OPEN_LEASE);
1166         bool             try_layout = false;
1167         bool             create_layout = false;
1168         int              rc = 0;
1169         ENTRY;
1170
1171         *ibits = 0;
1172         mdt_lock_handle_init(lhc);
1173
1174         if (req_is_replay(mdt_info_req(info)))
1175                 RETURN(0);
1176
1177         if (S_ISREG(lu_object_attr(&obj->mot_obj))) {
1178                 if (ma->ma_need & MA_LOV && !(ma->ma_valid & MA_LOV) &&
1179                     md_should_create(open_flags))
1180                         create_layout = true;
1181                 if (exp_connect_layout(info->mti_exp) && !create_layout &&
1182                     ma->ma_need & MA_LOV)
1183                         try_layout = true;
1184         }
1185
1186         if (acq_lease) {
1187                 /* lease open, acquire write mode of open sem */
1188                 down_write(&obj->mot_open_sem);
1189
1190                 /* Lease exists and ask for new lease */
1191                 if (atomic_read(&obj->mot_lease_count) > 0) {
1192                         /* only exclusive open is supported, so lease
1193                          * are conflicted to each other */
1194                         GOTO(out, rc = -EBUSY);
1195                 }
1196
1197                 /* Lease must be with open lock */
1198                 if (!(open_flags & MDS_OPEN_LOCK)) {
1199                         CERROR("Request lease for file:"DFID ", but open lock "
1200                                 "is missed, open_flags = "LPO64".\n",
1201                                 PFID(mdt_object_fid(obj)), open_flags);
1202                         GOTO(out, rc = -EPROTO);
1203                 }
1204
1205                 /* XXX: only exclusive open is supported. */
1206                 lm = LCK_EX;
1207                 *ibits = MDS_INODELOCK_OPEN;
1208
1209                 /* never grant LCK_EX layout lock to client */
1210                 try_layout = false;
1211         } else { /* normal open */
1212                 /* normal open holds read mode of open sem */
1213                 down_read(&obj->mot_open_sem);
1214
1215                 if (open_flags & MDS_OPEN_LOCK) {
1216                         if (open_flags & FMODE_WRITE)
1217                                 lm = LCK_CW;
1218                         /* if file is released, we can't deny write because we must
1219                          * restore (write) it to access it. */
1220                         else if ((open_flags & MDS_FMODE_EXEC) &&
1221                                  !((ma->ma_valid & MA_HSM) &&
1222                                    (ma->ma_hsm.mh_flags & HS_RELEASED)))
1223                                 lm = LCK_PR;
1224                         else
1225                                 lm = LCK_CR;
1226
1227                         *ibits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_OPEN;
1228                 } else if (atomic_read(&obj->mot_lease_count) > 0) {
1229                         if (open_flags & FMODE_WRITE)
1230                                 lm = LCK_CW;
1231                         else
1232                                 lm = LCK_CR;
1233
1234                         /* revoke lease */
1235                         *ibits = MDS_INODELOCK_OPEN;
1236                         try_layout = false;
1237
1238                         lhc = &info->mti_lh[MDT_LH_LOCAL];
1239                 }
1240                 CDEBUG(D_INODE, "normal open:"DFID" lease count: %d, lm: %d\n",
1241                         PFID(mdt_object_fid(obj)),
1242                         atomic_read(&obj->mot_open_count), lm);
1243         }
1244
1245         mdt_lock_reg_init(lhc, lm);
1246
1247         /* one problem to return layout lock on open is that it may result
1248          * in too many layout locks cached on the client side. */
1249         if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_NO_LL_OPEN) && try_layout) {
1250                 /* return lookup lock to validate inode at the client side,
1251                  * this is pretty important otherwise mdt will return layout
1252                  * lock for each open.
1253                  * However this is a double-edged sword because changing
1254                  * permission will revoke huge # of LOOKUP locks. */
1255                 *ibits |= MDS_INODELOCK_LAYOUT | MDS_INODELOCK_LOOKUP;
1256                 if (!mdt_object_lock_try(info, obj, lhc, *ibits,
1257                                          MDT_CROSS_LOCK)) {
1258                         *ibits &= ~(MDS_INODELOCK_LAYOUT|MDS_INODELOCK_LOOKUP);
1259                         if (*ibits != 0)
1260                                 rc = mdt_object_lock(info, obj, lhc, *ibits,
1261                                                 MDT_CROSS_LOCK);
1262                 }
1263         } else if (*ibits != 0) {
1264                 rc = mdt_object_lock(info, obj, lhc, *ibits, MDT_CROSS_LOCK);
1265         }
1266
1267         CDEBUG(D_INODE, "Requested bits lock:"DFID ", ibits = "LPX64
1268                 ", open_flags = "LPO64", try_layout = %d, rc = %d\n",
1269                 PFID(mdt_object_fid(obj)), *ibits, open_flags, try_layout, rc);
1270
1271         /* will change layout, revoke layout locks by enqueuing EX lock. */
1272         if (rc == 0 && create_layout) {
1273                 struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LAYOUT];
1274
1275                 CDEBUG(D_INODE, "Will create layout, get EX layout lock:"DFID
1276                         ", open_flags = "LPO64"\n",
1277                         PFID(mdt_object_fid(obj)), open_flags);
1278
1279                 LASSERT(!try_layout);
1280                 mdt_lock_handle_init(ll);
1281                 mdt_lock_reg_init(ll, LCK_EX);
1282                 rc = mdt_object_lock(info, obj, ll, MDS_INODELOCK_LAYOUT,
1283                                         MDT_LOCAL_LOCK);
1284
1285                 OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LL_BLOCK, 2);
1286         }
1287
1288         /* Check if there is any other open handles after acquiring
1289          * open lock. At this point, caching open handles have been revoked
1290          * by open lock.
1291          * XXX: Now only exclusive open is supported. Need to check the
1292          * type of open for generic lease support. */
1293         if (rc == 0 && acq_lease) {
1294                 struct ptlrpc_request *req = mdt_info_req(info);
1295                 struct mdt_export_data *med = &req->rq_export->exp_mdt_data;
1296                 struct mdt_file_data *mfd;
1297                 bool is_replay_or_resent;
1298                 int open_count = 0;
1299
1300                 /* For lease: application can open a file and then apply lease,
1301                  * @handle contains original open handle in that case.
1302                  * In recovery, open REQ will be replayed and the lease REQ may
1303                  * be resent that means the open handle is already stale, so we
1304                  * need to fix it up here by finding new handle. */
1305                 is_replay_or_resent = req_is_replay(req) ||
1306                         lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT;
1307
1308                 /* if the request is _not_ a replay request, rr_handle
1309                  * may be used to hold an openhandle which is issuing the
1310                  * lease request, so that this openhandle doesn't count. */
1311                 mfd = mdt_handle2mfd(med, info->mti_rr.rr_handle,
1312                                      is_replay_or_resent);
1313                 if (mfd != NULL)
1314                         ++open_count;
1315
1316                 CDEBUG(D_INODE, "acq_lease "DFID": openers: %d, want: %d\n",
1317                         PFID(mdt_object_fid(obj)),
1318                         atomic_read(&obj->mot_open_count), open_count);
1319
1320                 if (atomic_read(&obj->mot_open_count) > open_count)
1321                         GOTO(out, rc = -EBUSY);
1322         }
1323         GOTO(out, rc);
1324
1325 out:
1326         RETURN(rc);
1327 }
1328
1329 static void mdt_object_open_unlock(struct mdt_thread_info *info,
1330                                    struct mdt_object *obj,
1331                                    struct mdt_lock_handle *lhc,
1332                                    __u64 ibits, int rc)
1333 {
1334         __u64 open_flags = info->mti_spec.sp_cr_flags;
1335         struct mdt_lock_handle *ll = &info->mti_lh[MDT_LH_LOCAL];
1336         ENTRY;
1337
1338         if (req_is_replay(mdt_info_req(info)))
1339                 RETURN_EXIT;
1340
1341         /* Release local lock - the lock put in MDT_LH_LOCAL will never
1342          * return to client side. */
1343         if (lustre_handle_is_used(&ll->mlh_reg_lh))
1344                 mdt_object_unlock(info, obj, ll, 1);
1345
1346         ll = &info->mti_lh[MDT_LH_LAYOUT];
1347         /* Release local layout lock, layout was created */
1348         if (lustre_handle_is_used(&ll->mlh_reg_lh)) {
1349                 LASSERT(!(ibits & MDS_INODELOCK_LAYOUT));
1350                 mdt_object_unlock(info, obj, ll, 1);
1351         }
1352
1353         if (open_flags & MDS_OPEN_LEASE)
1354                 up_write(&obj->mot_open_sem);
1355         else
1356                 up_read(&obj->mot_open_sem);
1357
1358         /* Cross-ref case, the lock should be returned to the client */
1359         if (ibits == 0 || rc == -EREMOTE)
1360                 RETURN_EXIT;
1361
1362         if (!(open_flags & MDS_OPEN_LOCK) && !(ibits & MDS_INODELOCK_LAYOUT)) {
1363                 /* for the open request, the lock will only return to client
1364                  * if open or layout lock is granted. */
1365                 rc = 1;
1366         }
1367
1368         if (rc != 0) {
1369                 struct ldlm_reply       *ldlm_rep;
1370
1371                 ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1372                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1373                 mdt_object_unlock(info, obj, lhc, 1);
1374         }
1375         RETURN_EXIT;
1376 }
1377
1378 int mdt_open_by_fid_lock(struct mdt_thread_info *info, struct ldlm_reply *rep,
1379                          struct mdt_lock_handle *lhc)
1380 {
1381         const struct lu_env     *env   = info->mti_env;
1382         struct mdt_device       *mdt   = info->mti_mdt;
1383         __u64                    flags = info->mti_spec.sp_cr_flags;
1384         struct mdt_reint_record *rr    = &info->mti_rr;
1385         struct md_attr          *ma    = &info->mti_attr;
1386         struct mdt_object       *parent= NULL;
1387         struct mdt_object       *o;
1388         int                      rc;
1389         __u64                    ibits = 0;
1390         ENTRY;
1391
1392         if (md_should_create(flags) && !(flags & MDS_OPEN_HAS_EA)) {
1393                 if (!lu_fid_eq(rr->rr_fid1, rr->rr_fid2)) {
1394                         parent = mdt_object_find(env, mdt, rr->rr_fid1);
1395                         if (IS_ERR(parent)) {
1396                                 CDEBUG(D_INODE, "Fail to find parent "DFID
1397                                        " for anonymous created %ld, try to"
1398                                        " use server-side parent.\n",
1399                                        PFID(rr->rr_fid1), PTR_ERR(parent));
1400                                 parent = NULL;
1401                         }
1402                 }
1403                 if (parent == NULL)
1404                         ma->ma_need |= MA_PFID;
1405         }
1406
1407         o = mdt_object_find(env, mdt, rr->rr_fid2);
1408         if (IS_ERR(o))
1409                 RETURN(rc = PTR_ERR(o));
1410
1411         if (mdt_object_remote(o)) {
1412                 CDEBUG(D_INFO, "%s: "DFID" is on remote MDT.\n",
1413                        mdt_obd_name(info->mti_mdt),
1414                        PFID(rr->rr_fid2));
1415                 GOTO(out, rc = -EREMOTE);
1416         } else if (!mdt_object_exists(o)) {
1417                 mdt_set_disposition(info, rep,
1418                                     DISP_IT_EXECD |
1419                                     DISP_LOOKUP_EXECD |
1420                                     DISP_LOOKUP_NEG);
1421                 GOTO(out, rc = -ENOENT);
1422         }
1423
1424         mdt_set_disposition(info, rep, (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1425
1426         rc = mdt_attr_get_complex(info, o, ma);
1427         if (rc)
1428                 GOTO(out, rc);
1429
1430         rc = mdt_object_open_lock(info, o, lhc, &ibits);
1431         if (rc)
1432                 GOTO(out_unlock, rc);
1433
1434         if (ma->ma_valid & MA_PFID) {
1435                 parent = mdt_object_find(env, mdt, &ma->ma_pfid);
1436                 if (IS_ERR(parent)) {
1437                         CDEBUG(D_INODE, "Fail to find parent "DFID
1438                                " for anonymous created %ld, try to"
1439                                " use system default.\n",
1440                                PFID(&ma->ma_pfid), PTR_ERR(parent));
1441                         parent = NULL;
1442                 }
1443         }
1444
1445         rc = mdt_finish_open(info, parent, o, flags, 0, rep);
1446         if (!rc) {
1447                 mdt_set_disposition(info, rep, DISP_LOOKUP_POS);
1448                 if (flags & MDS_OPEN_LOCK)
1449                         mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
1450                 if (flags & MDS_OPEN_LEASE)
1451                         mdt_set_disposition(info, rep, DISP_OPEN_LEASE);
1452         }
1453         GOTO(out_unlock, rc);
1454
1455 out_unlock:
1456         mdt_object_open_unlock(info, o, lhc, ibits, rc);
1457 out:
1458         mdt_object_put(env, o);
1459         if (parent != NULL)
1460                 mdt_object_put(env, parent);
1461         return rc;
1462 }
1463
1464 int mdt_pin(struct mdt_thread_info* info)
1465 {
1466         ENTRY;
1467         RETURN(err_serious(-EOPNOTSUPP));
1468 }
1469
1470 /* Cross-ref request. Currently it can only be a pure open (w/o create) */
1471 static int mdt_cross_open(struct mdt_thread_info *info,
1472                           const struct lu_fid *parent_fid,
1473                           const struct lu_fid *fid,
1474                           struct ldlm_reply *rep, __u32 flags)
1475 {
1476         struct md_attr    *ma = &info->mti_attr;
1477         struct mdt_object *o;
1478         int                rc;
1479         ENTRY;
1480
1481         o = mdt_object_find(info->mti_env, info->mti_mdt, fid);
1482         if (IS_ERR(o))
1483                 RETURN(rc = PTR_ERR(o));
1484
1485         if (mdt_object_remote(o)) {
1486                 /* Something is wrong here, the object is on another MDS! */
1487                 CERROR("%s: "DFID" isn't on this server!: rc = %d\n",
1488                        mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1489                 LU_OBJECT_DEBUG(D_WARNING, info->mti_env,
1490                                 &o->mot_obj,
1491                                 "Object isn't on this server! FLD error?\n");
1492                 rc = -EFAULT;
1493         } else {
1494                 if (mdt_object_exists(o)) {
1495                         /* Do permission check for cross-open. */
1496                         rc = mo_permission(info->mti_env, NULL,
1497                                            mdt_object_child(o),
1498                                            NULL, flags | MDS_OPEN_CROSS);
1499                         if (rc)
1500                                 goto out;
1501
1502                         mdt_set_capainfo(info, 0, fid, BYPASS_CAPA);
1503                         rc = mdt_attr_get_complex(info, o, ma);
1504                         if (rc != 0)
1505                                 GOTO(out, rc);
1506
1507                         /* Do not create lov object if the fid is opened
1508                          * under OBF */
1509                         if (S_ISREG(ma->ma_attr.la_mode) &&
1510                             !(ma->ma_valid & MA_LOV) && (flags & FMODE_WRITE) &&
1511                             fid_is_obf(parent_fid))
1512                                 GOTO(out, rc = -EPERM);
1513
1514                         rc = mdt_finish_open(info, NULL, o, flags, 0, rep);
1515                 } else {
1516                         /*
1517                          * Something is wrong here. lookup was positive but
1518                          * there is no object!
1519                          */
1520                         CERROR("%s: "DFID" doesn't exist!: rc = %d\n",
1521                               mdt_obd_name(info->mti_mdt), PFID(fid), -EFAULT);
1522                         rc = -EFAULT;
1523                 }
1524         }
1525 out:
1526         mdt_object_put(info->mti_env, o);
1527         RETURN(rc);
1528 }
1529
/**
 * Handle an open intent request (the capsule format must be
 * RQF_LDLM_INTENT_OPEN).
 *
 * The request is dispatched as follows:
 *  - cross-ref open: handled entirely by mdt_cross_open();
 *  - replay, or a liblustre client with MDS_OPEN_HAS_EA: mdt_open_by_fid() is
 *    tried first and only -ENOENT together with MDS_OPEN_CREAT falls through
 *    to the regular path;
 *  - MDS_OPEN_BY_FID, or an empty name with MDS_OPEN_LOCK:
 *    mdt_open_by_fid_lock() is tried first;
 *  - regular path: lock the parent, look the name up, create the child when
 *    the lookup is negative and MDS_OPEN_CREAT is set, take the open lock
 *    (not on replay) and finish the open.  If finishing the open fails for a
 *    freshly created child, the child is unlinked again.
 *
 * \param[in] info  per-thread request state
 * \param[in] lhc   lock handle used for the child/open lock
 *
 * \retval 0 on success
 * \retval -EREMOTE if the child is a remote object; its FID is stored in the
 *         reply body
 * \retval other negative errno on failure; the reply transno is reset to 0
 *         for any non-zero result
 */
1530 int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
1531 {
1532         struct mdt_device       *mdt = info->mti_mdt;
1533         struct ptlrpc_request   *req = mdt_info_req(info);
1534         struct mdt_object       *parent;
1535         struct mdt_object       *child;
1536         struct mdt_lock_handle  *lh;
1537         struct ldlm_reply       *ldlm_rep;
1538         struct mdt_body         *repbody;
1539         struct lu_fid           *child_fid = &info->mti_tmp_fid1;
1540         struct md_attr          *ma = &info->mti_attr;
1541         __u64                    create_flags = info->mti_spec.sp_cr_flags;
             /* Must start at 0: on replay mdt_object_open_lock() is skipped,
              * yet ibits is still handed to mdt_object_open_unlock() below. */
1542         __u64                    ibits = 0;
1543         struct mdt_reint_record *rr = &info->mti_rr;
1544         struct lu_name          *lname;
1545         int                      result, rc;
1546         int                      created = 0;
1547         __u32                    msg_flags;
1548         ENTRY;
1549
1550         OBD_FAIL_TIMEOUT_ORSET(OBD_FAIL_MDS_PAUSE_OPEN, OBD_FAIL_ONCE,
1551                                (obd_timeout + 1) / 4);
1552
1553         mdt_counter_incr(req, LPROC_MDT_OPEN);
1554         repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
1555
             /* Attributes are fetched into the reply buffers; the LOV EA is
              * only requested when the reply has room for it. */
1556         ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD);
1557         ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD,
1558                                                RCL_SERVER);
1559         ma->ma_need = MA_INODE;
1560         if (ma->ma_lmm_size > 0)
1561                 ma->ma_need |= MA_LOV;
1562
1563         ma->ma_valid = 0;
1564
1565         LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN);
1566         ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP);
1567
1568         if (unlikely(create_flags & MDS_OPEN_JOIN_FILE)) {
1569                 CERROR("file join is not supported anymore.\n");
1570                 GOTO(out, result = err_serious(-EOPNOTSUPP));
1571         }
1572         msg_flags = lustre_msg_get_flags(req->rq_reqmsg);
1573
1574         if ((create_flags & (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS)) &&
1575             info->mti_spec.u.sp_ea.eadata == NULL)
1576                 GOTO(out, result = err_serious(-EINVAL));
1577
1578         CDEBUG(D_INODE, "I am going to open "DFID"/(%s->"DFID") "
1579                "cr_flag="LPO64" mode=0%06o msg_flag=0x%x\n",
1580                PFID(rr->rr_fid1), rr->rr_name,
1581                PFID(rr->rr_fid2), create_flags,
1582                ma->ma_attr.la_mode, msg_flags);
1583         if (info->mti_cross_ref) {
1584                 /* This is cross-ref open */
1585                 mdt_set_disposition(info, ldlm_rep,
1586                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD |
1587                              DISP_LOOKUP_POS));
1588                 result = mdt_cross_open(info, rr->rr_fid2, rr->rr_fid1,
1589                                         ldlm_rep, create_flags);
1590                 GOTO(out, result);
1591         } else if (req_is_replay(req) ||
1592             (req->rq_export->exp_libclient && create_flags & MDS_OPEN_HAS_EA)) {
1593                 /* This is a replay request or from liblustre with ea. */
1594                 result = mdt_open_by_fid(info, ldlm_rep);
1595
1596                 if (result != -ENOENT) {
                             /* For liblustre with EA any result other than
                              * -ENOENT is reported as success. */
1597                         if (req->rq_export->exp_libclient &&
1598                             create_flags & MDS_OPEN_HAS_EA)
1599                                 GOTO(out, result = 0);
1600                         GOTO(out, result);
1601                 }
1602                 /* We didn't find the correct object, so we need to re-create it
1603                  * via a regular replay. */
1604                 if (!(create_flags & MDS_OPEN_CREAT)) {
1605                         DEBUG_REQ(D_ERROR, req,
1606                                   "OPEN & CREAT not in open replay/by_fid.");
1607                         GOTO(out, result = -EFAULT);
1608                 }
1609                 CDEBUG(D_INFO, "No object(1), continue as regular open.\n");
1610         } else if ((rr->rr_namelen == 0 && create_flags & MDS_OPEN_LOCK) ||
1611                    (create_flags & MDS_OPEN_BY_FID)) {
1612                 result = mdt_open_by_fid_lock(info, ldlm_rep, lhc);
1613                 /* If result is 0 then open by FID has found the file
1614                  * and there is nothing left for us to do here.  More
1615                  * generally if it is anything other than -ENOENT or
1616                  * -EREMOTE then we return that now.  If -ENOENT and
1617                  * MDS_OPEN_CREAT is set then we must create the file
1618                  * below.  If -EREMOTE then we need to return a LOOKUP
1619                  * lock to the client, which we do below.  Hence this
1620                  * odd looking condition.  See LU-2523. */
1621                 if (!(result == -ENOENT && (create_flags & MDS_OPEN_CREAT)) &&
1622                     result != -EREMOTE)
1623                         GOTO(out, result);
1624
1625                 if (unlikely(rr->rr_namelen == 0))
1626                         GOTO(out, result = -EINVAL);
1627
1628                 CDEBUG(D_INFO, "No object(2), continue as regular open.\n");
1629         }
1630
1631         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK))
1632                 GOTO(out, result = err_serious(-ENOMEM));
1633
1634         mdt_set_disposition(info, ldlm_rep,
1635                             (DISP_IT_EXECD | DISP_LOOKUP_EXECD));
1636
             /* The parent is locked PW when a create may happen, PR otherwise. */
1637         lh = &info->mti_lh[MDT_LH_PARENT];
1638         mdt_lock_pdo_init(lh, (create_flags & MDS_OPEN_CREAT) ?
1639                           LCK_PW : LCK_PR, rr->rr_name, rr->rr_namelen);
1640
1641         parent = mdt_object_find_lock(info, rr->rr_fid1, lh,
1642                                       MDS_INODELOCK_UPDATE);
1643         if (IS_ERR(parent))
1644                 GOTO(out, result = PTR_ERR(parent));
1645
1646         /* get and check version of parent */
1647         result = mdt_version_get_check(info, parent, 0);
1648         if (result)
1649                 GOTO(out_parent, result);
1650
1651         fid_zero(child_fid);
1652
1653         lname = mdt_name(info->mti_env, (char *)rr->rr_name, rr->rr_namelen);
1654         result = mdo_lookup(info->mti_env, mdt_object_child(parent),
1655                             lname, child_fid, &info->mti_spec);
1656         LASSERTF(ergo(result == 0, fid_is_sane(child_fid)),
1657                  "looking for "DFID"/%s, result fid="DFID"\n",
1658                  PFID(mdt_object_fid(parent)), rr->rr_name, PFID(child_fid));
1659
1660         if (result != 0 && result != -ENOENT && result != -ESTALE)
1661                 GOTO(out_parent, result);
1662
1663         if (result == -ENOENT || result == -ESTALE) {
1664                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_NEG);
1665                 if (result == -ESTALE) {
1666                         /*
1667                          * -ESTALE means the parent is a dead(unlinked) dir, so
1668                          * it should return -ENOENT in accordance with the
1669                          * original mds implementation.
1670                          */
1671                         GOTO(out_parent, result = -ENOENT);
1672                 }
1673                 if (!(create_flags & MDS_OPEN_CREAT))
1674                         GOTO(out_parent, result);
                     /* Negative lookup with MDS_OPEN_CREAT: the new child uses
                      * the FID supplied in the request (rr_fid2). */
1675                 *child_fid = *info->mti_rr.rr_fid2;
1676                 LASSERTF(fid_is_sane(child_fid), "fid="DFID"\n",
1677                          PFID(child_fid));
1678                 /* In the function below, .hs_keycmp resolves to
1679                  * lu_obj_hop_keycmp() */
1680                 /* coverity[overrun-buffer-val] */
1681                 child = mdt_object_new(info->mti_env, mdt, child_fid);
1682         } else {
1683                 /*
1684                  * Check for O_EXCL is moved to the mdt_finish_open(), we need to
1685                  * return FID back in that case.
1686                  */
1687                 mdt_set_disposition(info, ldlm_rep, DISP_LOOKUP_POS);
1688                 child = mdt_object_find(info->mti_env, mdt, child_fid);
1689         }
1690         if (IS_ERR(child))
1691                 GOTO(out_parent, result = PTR_ERR(child));
1692
1693         /** check version of child  */
1694         rc = mdt_version_get_check(info, child, 1);
1695         if (rc)
1696                 GOTO(out_child, result = rc);
1697
1698         mdt_set_capainfo(info, 1, child_fid, BYPASS_CAPA);
             /* result still holds the lookup outcome: -ENOENT means "create". */
1699         if (result == -ENOENT) {
1700                 /* Create under OBF and .lustre is not permitted */
1701                 if (fid_is_obf(rr->rr_fid1) || fid_is_dot_lustre(rr->rr_fid1))
1702                         GOTO(out_child, result = -EPERM);
1703
1704                 /* save versions in reply */
1705                 mdt_version_get_save(info, parent, 0);
1706                 mdt_version_get_save(info, child, 1);
1707
1708                 /* version of child will be changed */
1709                 info->mti_mos = child;
1710
1711                 /* Not found and with MDS_OPEN_CREAT: let's create it. */
1712                 mdt_set_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1713
1714                 /* Let lower layers know what is lock mode on directory. */
1715                 info->mti_spec.sp_cr_mode =
1716                         mdt_dlm_mode2mdl_mode(lh->mlh_pdo_mode);
1717
1718                 /*
1719                  * Do not perform lookup sanity check. We know that name does
1720                  * not exist.
1721                  */
1722                 info->mti_spec.sp_cr_lookup = 0;
1723                 info->mti_spec.sp_feat = &dt_directory_features;
1724
1725                 result = mdo_create(info->mti_env,
1726                                     mdt_object_child(parent),
1727                                     lname,
1728                                     mdt_object_child(child),
1729                                     &info->mti_spec,
1730                                     &info->mti_attr);
1731                 if (result == -ERESTART) {
1732                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1733                         GOTO(out_child, result);
1734                 } else {
1735
1736                         /* XXX: we should call this once, see few lines below */
1737                         if (result == 0)
1738                                 result = mdt_attr_get_complex(info, child, ma);
1739
1740                         if (result != 0)
1741                                 GOTO(out_child, result);
1742                 }
1743                 created = 1;
1744         } else {
1745                 /*
1746                  * The object is on remote node, return its FID for remote open.
1747                  */
1748                 if (mdt_object_remote(child)) {
1749                         /*
1750                          * Check if this lock already was sent to client and
1751                          * this is resent case. For resent case do not take lock
1752                          * again, use what is already granted.
1753                          */
1754                         LASSERT(lhc != NULL);
1755
1756                         if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
1757                                 struct ldlm_lock *lock;
1758
1759                                 LASSERT(msg_flags & MSG_RESENT);
1760
1761                                 lock = ldlm_handle2lock(&lhc->mlh_reg_lh);
1762                                 if (!lock) {
1763                                         CERROR("Invalid lock handle "LPX64"\n",
1764                                                lhc->mlh_reg_lh.cookie);
1765                                         LBUG();
1766                                 }
1767                                 LASSERT(fid_res_name_eq(mdt_object_fid(child),
1768                                                         &lock->l_resource->lr_name));
1769                                 LDLM_LOCK_PUT(lock);
1770                                 rc = 0;
1771                         } else {
1772                                 mdt_lock_handle_init(lhc);
1773                                 mdt_lock_reg_init(lhc, LCK_PR);
1774
1775                                 rc = mdt_object_lock(info, child, lhc,
1776                                                      MDS_INODELOCK_LOOKUP,
1777                                                      MDT_CROSS_LOCK);
1778                         }
1779                         repbody->fid1 = *mdt_object_fid(child);
1780                         repbody->valid |= (OBD_MD_FLID | OBD_MD_MDS);
1781                         if (rc != 0)
1782                                 result = rc;
1783                         else
1784                                 result = -EREMOTE;
1785                         GOTO(out_child, result);
1786                 } else {
1787                         if (mdt_object_exists(child)) {
1788                                 /* We have to get attr & LOV EA & HSM for this
1789                                  * object */
1790                                 ma->ma_need |= MA_HSM;
1791                                 result = mdt_attr_get_complex(info, child, ma);
1792                         } else {
1793                                 /*object non-exist!!!*/
1794                                 LBUG();
1795                         }
1796                 }
1797         }
1798
1799         LASSERT(!lustre_handle_is_used(&lhc->mlh_reg_lh));
1800
1801         /* get openlock if this is not replay and if a client requested it */
1802         if (!req_is_replay(req)) {
1803                 rc = mdt_object_open_lock(info, child, lhc, &ibits);
1804                 if (rc != 0)
1805                         GOTO(out_child_unlock, result = rc);
1806                 else if (create_flags & MDS_OPEN_LOCK)
1807                         mdt_set_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1808         }
1809
1810         /* Try to open it now. */
1811         rc = mdt_finish_open(info, parent, child, create_flags,
1812                              created, ldlm_rep);
1813         if (rc) {
1814                 result = rc;
1815                 /* openlock will be released if mdt_finish_open failed */
1816                 mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_LOCK);
1817                 if (created) {
                             /* Undo the create done above; a failure of the
                              * cleanup itself is only logged. */
1818                         ma->ma_need = 0;
1819                         ma->ma_valid = 0;
1820                         ma->ma_cookie_size = 0;
1821                         rc = mdo_unlink(info->mti_env,
1822                                         mdt_object_child(parent),
1823                                         mdt_object_child(child),
1824                                         lname,
1825                                         &info->mti_attr, 0);
1826                         if (rc != 0)
1827                                 CERROR("%s: "DFID" cleanup of open: rc = %d\n",
1828                                        mdt_obd_name(info->mti_mdt),
1829                                        PFID(mdt_object_fid(child)), rc);
1830                         mdt_clear_disposition(info, ldlm_rep, DISP_OPEN_CREATE);
1831                 }
1832         }
1833         EXIT;
1834 out_child_unlock:
1835         mdt_object_open_unlock(info, child, lhc, ibits, result);
1836 out_child:
1837         mdt_object_put(info->mti_env, child);
1838 out_parent:
1839         mdt_object_unlock_put(info, parent, lh, result || !created);
1840 out:
             /* A failed open must not carry a transaction number in the reply. */
1841         if (result)
1842                 lustre_msg_set_transno(req->rq_repmsg, 0);
1843         return result;
1844 }
1845
1846 #define MFD_CLOSED(mode) (((mode) & ~(MDS_FMODE_EPOCH | MDS_FMODE_SOM | \
1847                                       MDS_FMODE_TRUNC)) == MDS_FMODE_CLOSED)
1848
1849 static int mdt_mfd_closed(struct mdt_file_data *mfd)
1850 {
1851         return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
1852 }
1853
/**
 * Close one open file handle (mfd) of an object.
 *
 * Based on the open mode stored in \a mfd this releases the write count or
 * re-allows writes and closes the IO epoch / Size-on-MDS update.  It may
 * then set atime, add the HSM dirty flag, and call mo_close() unless the
 * mode is already "closed" per MFD_CLOSED().
 *
 * When the epoch/SOM close reports MDT_IOEPOCH_GETATTR or MDT_IOEPOCH_OPENED
 * the mfd is kept alive: its mode is switched to MDS_FMODE_SOM or
 * MDS_FMODE_EPOCH and it is put back on the export's open list and into the
 * handle hash.  Otherwise the lease/open counters of the object are
 * decremented, the mfd is freed and one object reference is dropped.
 *
 * \param[in] info  per-thread request state; info->mti_attr supplies the
 *                  attributes and flags sent with the close
 * \param[in] mfd   file handle to close; must not be NULL
 *
 * \retval rc if the last of mo_attr_set()/mdt_add_dirty_flag()/mo_close()
 *         that ran left a non-zero value, otherwise the epoch status: 0 when
 *         the epoch is still opened, -EAGAIN when a SOM attribute update is
 *         needed, or the unmodified close status
 */
1854 int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
1855 {
1856         struct mdt_object *o = mfd->mfd_object;
1857         struct md_object *next = mdt_object_child(o);
1858         struct md_attr *ma = &info->mti_attr;
1859         int ret = MDT_IOEPOCH_CLOSED;
1860         int rc = 0;
1861         __u64 mode;
1862         ENTRY;
1863
1864         mode = mfd->mfd_mode;
1865
             /* Exactly one branch runs; write/truncate takes precedence over
              * exec, which takes precedence over the epoch/SOM-only modes. */
1866         if ((mode & FMODE_WRITE) || (mode & MDS_FMODE_TRUNC)) {
1867                 mdt_write_put(o);
1868                 ret = mdt_ioepoch_close(info, o);
1869         } else if (mode & MDS_FMODE_EXEC) {
1870                 mdt_write_allow(o);
1871         } else if (mode & MDS_FMODE_EPOCH) {
1872                 ret = mdt_ioepoch_close(info, o);
1873         } else if (mode & MDS_FMODE_SOM) {
1874                 ret = mdt_som_au_close(info, o);
1875         }
1876
1877         /* Update atime on close only. */
1878         if ((mode & MDS_FMODE_EXEC || mode & FMODE_READ || mode & FMODE_WRITE)
1879             && (ma->ma_valid & MA_INODE) && (ma->ma_attr.la_valid & LA_ATIME)) {
1880                 /* Set the atime only. */
1881                 ma->ma_valid = MA_INODE;
1882                 ma->ma_attr.la_valid = LA_ATIME;
                     /* NOTE(review): this rc is overwritten below whenever
                      * mdt_add_dirty_flag() or mo_close() is called. */
1883                 rc = mo_attr_set(info->mti_env, next, ma);
1884         }
1885
1886         /* If file data is modified, add the dirty flag.
1887          *
1888          * If MDS_CLOSE_CLEANUP is set, this file is being closed due to an
1889          * eviction, file could have been modified and now dirty
1890          * regarding to HSM archive, check this!
1891          * The logic here is to mark a file dirty if there's a chance it was
1892          * dirtied before the client was evicted, so that we don't have to wait
1893          * for a release attempt before finding out the file was actually dirty
1894          * and fail the release. Aggressively marking it dirty here will cause
1895          * the policy engine to attempt to re-archive it; when rearchiving, we
1896          * can compare the current version to the LMA data_version and make the
1897          * archive request into a noop if it's not actually dirty.
1898          */
1899         if ((ma->ma_attr_flags & MDS_DATA_MODIFIED) ||
1900             ((ma->ma_attr_flags & MDS_CLOSE_CLEANUP) &&
1901              (mode & (FMODE_WRITE|MDS_FMODE_TRUNC))))
1902                 rc = mdt_add_dirty_flag(info, o, ma);
1903
             /* Request inode attributes again and drop the stale valid bit
              * before calling mo_close(). */
1904         ma->ma_need |= MA_INODE;
1905         ma->ma_valid &= ~MA_INODE;
1906
1907         if (!MFD_CLOSED(mode))
1908                 rc = mo_close(info->mti_env, next, ma, mode);
1909
1910         if (ret == MDT_IOEPOCH_GETATTR || ret == MDT_IOEPOCH_OPENED) {
1911                 struct mdt_export_data *med;
1912
1913                 /* The IOepoch is still opened or SOM update is needed.
1914                  * Put mfd back into the list. */
1915                 LASSERT(mdt_conn_flags(info) & OBD_CONNECT_SOM);
1916                 mdt_mfd_set_mode(mfd, ret == MDT_IOEPOCH_OPENED ?
1917                                       MDS_FMODE_EPOCH : MDS_FMODE_SOM);
1918
1919                 LASSERT(mdt_info_req(info));
1920                 med = &mdt_info_req(info)->rq_export->exp_mdt_data;
1921                 spin_lock(&med->med_open_lock);
1922                 cfs_list_add(&mfd->mfd_list, &med->med_open_head);
1923                 class_handle_hash_back(&mfd->mfd_handle);
1924                 spin_unlock(&med->med_open_lock);
1925
                     /* Translate the epoch status into the value returned to
                      * the caller. */
1926                 if (ret == MDT_IOEPOCH_OPENED) {
1927                         ret = 0;
1928                 } else {
1929                         ret = -EAGAIN;
1930                         CDEBUG(D_INODE, "Size-on-MDS attribute update is "
1931                                "needed on "DFID"\n", PFID(mdt_object_fid(o)));
1932                 }
1933         } else {
1934                 /* adjust open and lease count */
1935                 if (mode & MDS_OPEN_LEASE) {
1936                         LASSERT(atomic_read(&o->mot_lease_count) > 0);
1937                         atomic_dec(&o->mot_lease_count);
1938                 }
1939                 LASSERT(atomic_read(&o->mot_open_count) > 0);
1940                 atomic_dec(&o->mot_open_count);
1941
                     /* The handle is gone for good: free it and drop one
                      * reference on the object. */
1942                 mdt_mfd_free(mfd);
1943                 mdt_object_put(info->mti_env, o);
1944         }
1945
1946         RETURN(rc ? rc : ret);
1947 }
1948
/**
 * CLOSE rpc handler.
 *
 * Unpacks the close request, sizes and packs the reply, and short-circuits
 * on a resent request via mdt_check_resent().  Otherwise the mfd matching
 * the handle in the request is looked up under med_open_lock, removed from
 * the handle hash and the export's open list, and closed with
 * mdt_mfd_close().  When a reply body is available, mdt_handle_last_unlink()
 * is called for the object afterwards.
 *
 * The handle is closed even if the reply could not be packed; in that case
 * no reply buffers are touched (repbody stays NULL).
 *
 * \retval rc if non-zero, else the value returned by mdt_mfd_close();
 *         err_serious() values are returned for unpack failures and the
 *         OBD_FAIL_MDS_CLOSE_PACK fail point
 */
1949 int mdt_close(struct mdt_thread_info *info)
1950 {
1951         struct mdt_export_data *med;
1952         struct mdt_file_data   *mfd;
1953         struct mdt_object      *o;
1954         struct md_attr         *ma = &info->mti_attr;
1955         struct mdt_body        *repbody = NULL;
1956         struct ptlrpc_request  *req = mdt_info_req(info);
1957         int rc, ret = 0;
1958         ENTRY;
1959
1960         mdt_counter_incr(req, LPROC_MDT_CLOSE);
1961         /* Close may come with the Size-on-MDS update. Unpack it. */
1962         rc = mdt_close_unpack(info);
1963         if (rc)
1964                 RETURN(err_serious(rc));
1965
1966         LASSERT(info->mti_ioepoch);
1967
             /* Reserve the maximum MD and log-cookie sizes in the reply. */
1968         req_capsule_set_size(info->mti_pill, &RMF_MDT_MD, RCL_SERVER,
1969                              info->mti_mdt->mdt_max_mdsize);
1970         req_capsule_set_size(info->mti_pill, &RMF_LOGCOOKIES, RCL_SERVER,
1971                              info->mti_mdt->mdt_max_cookiesize);
1972         rc = req_capsule_server_pack(info->mti_pill);
             /* The pack result is deliberately examined only after the resent
              * check; see "Continue to close handle" below. */
1973         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
1974                 mdt_client_compatibility(info);
1975                 if (rc == 0)
1976                         mdt_fix_reply(info);
1977                 mdt_exit_ucred(info);
1978                 RETURN(lustre_msg_get_status(req->rq_repmsg));
1979         }
1980
1981         /* Continue to close handle even if we can not pack reply */
1982         if (rc == 0) {
1983                 repbody = req_capsule_server_get(info->mti_pill,
1984                                                  &RMF_MDT_BODY);
1985                 ma->ma_lmm = req_capsule_server_get(info->mti_pill,
1986                                                     &RMF_MDT_MD);
1987                 ma->ma_lmm_size = req_capsule_get_size(info->mti_pill,
1988                                                        &RMF_MDT_MD,
1989                                                        RCL_SERVER);
1990                 ma->ma_cookie = req_capsule_server_get(info->mti_pill,
1991                                                        &RMF_LOGCOOKIES);
1992                 ma->ma_cookie_size = req_capsule_get_size(info->mti_pill,
1993                                                           &RMF_LOGCOOKIES,
1994                                                           RCL_SERVER);
1995                 ma->ma_need = MA_INODE | MA_LOV | MA_COOKIE;
1996                 repbody->eadatasize = 0;
1997                 repbody->aclsize = 0;
1998         } else {
1999                 rc = err_serious(rc);
2000         }
2001
2002         med = &req->rq_export->exp_mdt_data;
2003         spin_lock(&med->med_open_lock);
2004         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
2005                              req_is_replay(req));
2006         if (mdt_mfd_closed(mfd)) {
2007                 spin_unlock(&med->med_open_lock);
2008                 CDEBUG(D_INODE, "no handle for file close: fid = "DFID
2009                        ": cookie = "LPX64"\n", PFID(info->mti_rr.rr_fid1),
2010                        info->mti_ioepoch->handle.cookie);
2011                 /** not serious error since bug 3633 */
2012                 rc = -ESTALE;
2013         } else {
                     /* Detach the handle while still holding the lock so no
                      * other lookup can find it. */
2014                 class_handle_unhash(&mfd->mfd_handle);
2015                 cfs_list_del_init(&mfd->mfd_list);
2016                 spin_unlock(&med->med_open_lock);
2017
2018                 /* Do not lose object before last unlink. */
2019                 o = mfd->mfd_object;
2020                 mdt_object_get(info->mti_env, o);
2021                 ret = mdt_mfd_close(info, mfd);
2022                 if (repbody != NULL)
2023                         rc = mdt_handle_last_unlink(info, o, ma);
2024                 mdt_empty_transno(info, rc);
2025                 mdt_object_put(info->mti_env, o);
2026         }
2027         if (repbody != NULL) {
2028                 mdt_client_compatibility(info);
                     /* NOTE(review): this overwrites any rc set above
                      * (-ESTALE or the mdt_handle_last_unlink() result). */
2029                 rc = mdt_fix_reply(info);
2030         }
2031
2032         mdt_exit_ucred(info);
2033         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK))
2034                 RETURN(err_serious(-ENOMEM));
2035
2036         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_MDS_CLOSE_NET_REP,
2037                                  OBD_FAIL_MDS_CLOSE_NET_REP))
2038                 info->mti_fail_id = OBD_FAIL_MDS_CLOSE_NET_REP;
2039         RETURN(rc ? rc : ret);
2040 }
2041
2042 /**
2043  * DONE_WRITING rpc handler.
2044  *
2045  * As mfd is not kept after replayed CLOSE (see mdt_ioepoch_close_on_replay()),
2046  * only those DONE_WRITING rpcs will be replayed which really wrote something
2047  * on disk and got a transno. Waiting for such DONE_WRITING is not reliable,
2048  * so just skip attributes and reconstruct the reply here.
      *
      * The handler packs the reply, unpacks the request, and short-circuits on a
      * resent request.  Otherwise it looks the mfd up by the handle in the
      * request, detaches it from the handle hash and the export's open list,
      * and closes it with mdt_mfd_close() using a temporary LOV EA buffer.
      *
      * \retval 0 on success
      * \retval -ESTALE if no mfd matches the handle and this is not a replay
      * \retval -EAGAIN if no mfd matches on replay and MF_SOM_AU is set
      * \retval -ENOMEM if the temporary buffer cannot be allocated
      * \retval other values as returned by mdt_mfd_close(), or an
      *         err_serious() value for pack/unpack failures
2049  */
2050 int mdt_done_writing(struct mdt_thread_info *info)
2051 {
2052         struct ptlrpc_request   *req = mdt_info_req(info);
2053         struct mdt_body         *repbody = NULL;
2054         struct mdt_export_data  *med;
2055         struct mdt_file_data    *mfd;
2056         int rc;
2057         ENTRY;
2058
2059         rc = req_capsule_server_pack(info->mti_pill);
2060         if (rc)
2061                 RETURN(err_serious(rc));
2062
2063         repbody = req_capsule_server_get(info->mti_pill,
2064                                          &RMF_MDT_BODY);
2065         repbody->eadatasize = 0;
2066         repbody->aclsize = 0;
2067
2068         /* Done Writing may come with the Size-on-MDS update. Unpack it. */
2069         rc = mdt_close_unpack(info);
2070         if (rc)
2071                 RETURN(err_serious(rc));
2072
2073         if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) {
2074                 mdt_exit_ucred(info);
2075                 RETURN(lustre_msg_get_status(req->rq_repmsg));
2076         }
2077
2078         med = &info->mti_exp->exp_mdt_data;
2079         spin_lock(&med->med_open_lock);
2080         mfd = mdt_handle2mfd(med, &info->mti_ioepoch->handle,
2081                              req_is_replay(req));
2082         if (mfd == NULL) {
2083                 spin_unlock(&med->med_open_lock);
2084                 CDEBUG(D_INODE, "no handle for done write: fid = "DFID
2085                        ": cookie = "LPX64" ioepoch = "LPU64"\n",
2086                        PFID(info->mti_rr.rr_fid1),
2087                        info->mti_ioepoch->handle.cookie,
2088                        info->mti_ioepoch->ioepoch);
2089                 /* If this is a replay, reconstruct the transno. */
2090                 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
2091                         rc = info->mti_ioepoch->flags & MF_SOM_AU ?
2092                              -EAGAIN : 0;
2093                         mdt_empty_transno(info, rc);
2094                 } else
2095                         rc = -ESTALE;
2096                 GOTO(error_ucred, rc);
2097         }
2098
             /* Only handles parked in the epoch or truncate state are expected
              * here; detach the handle while the lock is still held. */
2099         LASSERT(mfd->mfd_mode == MDS_FMODE_EPOCH ||
2100                 mfd->mfd_mode == MDS_FMODE_TRUNC);
2101         class_handle_unhash(&mfd->mfd_handle);
2102         cfs_list_del_init(&mfd->mfd_list);
2103         spin_unlock(&med->med_open_lock);
2104
2105         /* Set EPOCH CLOSE flag if not set by client. */
2106         info->mti_ioepoch->flags |= MF_EPOCH_CLOSE;
2107         info->mti_attr.ma_valid = 0;
2108
             /* Temporary LOV EA buffer for the close; freed right after
              * mdt_mfd_close() returns. */
2109         info->mti_attr.ma_lmm_size = info->mti_mdt->mdt_max_mdsize;
2110         OBD_ALLOC_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
             /* NOTE(review): on this failure path the mfd has already been
              * unhashed and removed from the open list above, but it is
              * neither closed nor put back here - confirm whether it leaks. */
2111         if (info->mti_attr.ma_lmm == NULL)
2112                 GOTO(error_ucred, rc = -ENOMEM);
2113
2114         rc = mdt_mfd_close(info, mfd);
2115
             /* NOTE(review): ma_lmm and ma_lmm_size keep their old values
              * after the buffer is freed. */
2116         OBD_FREE_LARGE(info->mti_attr.ma_lmm, info->mti_mdt->mdt_max_mdsize);
2117         mdt_empty_transno(info, rc);
2118 error_ucred:
2119         mdt_exit_ucred(info);
2120         RETURN(rc);
2121 }