Whamcloud - gitweb
libext2fs: unix_io: fix potential error path deadlock in flush_cached_blocks()
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a one-block write-through cache.
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
24 #endif
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
27 #endif
28 #ifndef _GNU_SOURCE
29 #define _GNU_SOURCE
30 #endif
31 #endif
32
33 #include "config.h"
34 #include <stdio.h>
35 #include <string.h>
36 #if HAVE_UNISTD_H
37 #include <unistd.h>
38 #endif
39 #if HAVE_ERRNO_H
40 #include <errno.h>
41 #endif
42 #include <fcntl.h>
43 #include <time.h>
44 #ifdef __linux__
45 #include <sys/utsname.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
52 #endif
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
55 #endif
56 #ifdef HAVE_SYS_PRCTL_H
57 #include <sys/prctl.h>
58 #else
59 #define PR_GET_DUMPABLE 3
60 #endif
61 #if HAVE_SYS_STAT_H
62 #include <sys/stat.h>
63 #endif
64 #if HAVE_SYS_RESOURCE_H
65 #include <sys/resource.h>
66 #endif
67 #if HAVE_LINUX_FALLOC_H
68 #include <linux/falloc.h>
69 #endif
70 #ifdef HAVE_PTHREAD
71 #include <pthread.h>
72 #endif
73
74 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
75 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
76 #endif
77
78 #undef ALIGN_DEBUG
79
80 #include "ext2_fs.h"
81 #include "ext2fs.h"
82 #include "ext2fsP.h"
83
/*
 * For checking structure magic numbers...
 *
 * Expands to a bare `if` statement that makes the *calling* function
 * return `code` when `(struct)->magic` is not equal to `code`.  The
 * expansion is not wrapped in do { } while (0), so it should only be
 * used as a full statement, never as the body of an unbraced if/else.
 */

#define EXT2_CHECK_MAGIC(struct, code) \
          if ((struct)->magic != (code)) return (code)
90
/* One entry of the per-channel block cache. */
struct unix_cache {
        char                    *buf;           /* buffer holding the cached block's data */
        unsigned long long      block;          /* block number this entry holds */
        int                     access_time;    /* channel access counter value at last use;
                                                 * the smallest value marks the oldest entry */
        unsigned                dirty:1;        /* contents still have to be written out */
        unsigned                in_use:1;       /* entry currently holds a block */
        unsigned                write_err:1;    /* last attempt to write this entry failed */
};
99
/* Number of entries in the cache[] array of struct unix_private_data. */
#define CACHE_SIZE 8
#define WRITE_DIRECT_SIZE 4     /* Must be smaller than CACHE_SIZE */
#define READ_DIRECT_SIZE 4      /* Should be smaller than CACHE_SIZE */
103
/*
 * Per-channel private state of the Unix I/O manager; reached through
 * io_channel->private_data.
 */
struct unix_private_data {
        int     magic;          /* EXT2_ET_MAGIC_UNIX_IO_CHANNEL, verified by EXT2_CHECK_MAGIC */
        int     dev;            /* file descriptor all reads and writes go to */
        int     flags;          /* IO_FLAG_* bits the channel was opened with */
        int     align;          /* NOTE(review): not referenced in the visible code, which
                                 * uses channel->align instead -- confirm whether still used */
        int     access_time;    /* counter bumped on every cache hit / entry reuse */
        ext2_loff_t offset;     /* byte offset added to every block location */
        struct unix_cache cache[CACHE_SIZE];
        void    *bounce;        /* scratch buffer for I/O that cannot be done directly */
        struct struct_io_stats io_stats;
#ifdef HAVE_PTHREAD
        /* The mutexes are only taken when IO_FLAG_THREADS is set in flags. */
        pthread_mutex_t cache_mutex;    /* guards cache[] */
        pthread_mutex_t bounce_mutex;   /* guards bounce and the seek + read/write pairs */
        pthread_mutex_t stats_mutex;    /* guards io_stats */
#endif
};
120
/*
 * True when the bits of n selected by the mask (align - 1) are all zero.
 * n is cast to uintptr_t, so it may be a pointer or an integer value.
 * NOTE(review): the mask test equals "n is a multiple of align" only
 * when align is a power of two -- confirm callers guarantee that.
 */
#define IS_ALIGNED(n, align) ((((uintptr_t) n) & \
                               ((uintptr_t) ((align)-1))) == 0)
123
/*
 * Names one of the three per-channel mutexes in struct unix_private_data
 * (cache_mutex, bounce_mutex, stats_mutex) for mutex_lock()/mutex_unlock().
 */
typedef enum lock_kind {
        CACHE_MTX, BOUNCE_MTX, STATS_MTX
} kind_t;
127
#ifdef HAVE_PTHREAD
/*
 * Map a lock kind to the matching mutex inside `data`.
 *
 * Returns NULL when the channel was not opened with IO_FLAG_THREADS
 * (locking is then a no-op for the callers) or when `kind` is not one
 * of the known lock kinds.
 */
static inline pthread_mutex_t *get_mutex(struct unix_private_data *data,
                                         kind_t kind)
{
        if (!(data->flags & IO_FLAG_THREADS))
                return NULL;

        if (kind == CACHE_MTX)
                return &data->cache_mutex;
        if (kind == BOUNCE_MTX)
                return &data->bounce_mutex;
        if (kind == STATS_MTX)
                return &data->stats_mutex;

        return NULL;
}
#endif
145
146 static inline void mutex_lock(struct unix_private_data *data, kind_t kind)
147 {
148 #ifdef HAVE_PTHREAD
149         pthread_mutex_t *mtx = get_mutex(data,kind);
150
151         if (mtx)
152                 pthread_mutex_lock(mtx);
153 #endif
154 }
155
156 static inline void mutex_unlock(struct unix_private_data *data, kind_t kind)
157 {
158 #ifdef HAVE_PTHREAD
159         pthread_mutex_t *mtx = get_mutex(data,kind);
160
161         if (mtx)
162                 pthread_mutex_unlock(mtx);
163 #endif
164 }
165
166 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
167 {
168         errcode_t       retval = 0;
169
170         struct unix_private_data *data;
171
172         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
173         data = (struct unix_private_data *) channel->private_data;
174         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
175
176         if (stats) {
177                 mutex_lock(data, STATS_MTX);
178                 *stats = &data->io_stats;
179                 mutex_unlock(data, STATS_MTX);
180         }
181
182         return retval;
183 }
184
/*
 * Look up the environment variable `arg`, refusing to do so when the
 * process looks privileged.
 *
 * Returns NULL if the real and effective user or group IDs differ, or
 * (where prctl is available) if PR_GET_DUMPABLE reports 0.  Otherwise
 * the lookup goes through secure_getenv(), __secure_getenv() or plain
 * getenv(), whichever the build configuration provides.
 */
static char *safe_getenv(const char *arg)
{
        if (getuid() != geteuid())
                return NULL;
        if (getgid() != getegid())
                return NULL;

#if defined(HAVE_PRCTL)
        if (prctl(PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
                return NULL;
#elif (defined(linux) && defined(SYS_prctl))
        if (syscall(SYS_prctl, PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
                return NULL;
#endif

#if defined(HAVE_SECURE_GETENV)
        return secure_getenv(arg);
#elif defined(HAVE___SECURE_GETENV)
        return __secure_getenv(arg);
#else
        return getenv(arg);
#endif
}
207
208 /*
209  * Here are the raw I/O functions
210  */
211 static errcode_t raw_read_blk(io_channel channel,
212                               struct unix_private_data *data,
213                               unsigned long long block,
214                               int count, void *bufv)
215 {
216         errcode_t       retval;
217         ssize_t         size;
218         ext2_loff_t     location;
219         int             actual = 0;
220         unsigned char   *buf = bufv;
221         ssize_t         really_read = 0;
222         unsigned long long aligned_blk;
223         int             align_size, offset;
224
225         size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
226         mutex_lock(data, STATS_MTX);
227         data->io_stats.bytes_read += size;
228         mutex_unlock(data, STATS_MTX);
229         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
230
231         if (data->flags & IO_FLAG_FORCE_BOUNCE)
232                 goto bounce_read;
233
234 #ifdef HAVE_PREAD64
235         /* Try an aligned pread */
236         if ((channel->align == 0) ||
237             (IS_ALIGNED(buf, channel->align) &&
238              IS_ALIGNED(location, channel->align) &&
239              IS_ALIGNED(size, channel->align))) {
240                 actual = pread64(data->dev, buf, size, location);
241                 if (actual == size)
242                         return 0;
243                 actual = 0;
244         }
245 #elif HAVE_PREAD
246         /* Try an aligned pread */
247         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
248             ((channel->align == 0) ||
249              (IS_ALIGNED(buf, channel->align) &&
250               IS_ALIGNED(location, channel->align) &&
251               IS_ALIGNED(size, channel->align)))) {
252                 actual = pread(data->dev, buf, size, location);
253                 if (actual == size)
254                         return 0;
255                 actual = 0;
256         }
257 #endif /* HAVE_PREAD */
258
259         if ((channel->align == 0) ||
260             (IS_ALIGNED(buf, channel->align) &&
261              IS_ALIGNED(location, channel->align) &&
262              IS_ALIGNED(size, channel->align))) {
263                 mutex_lock(data, BOUNCE_MTX);
264                 if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
265                         retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
266                         goto error_unlock;
267                 }
268                 actual = read(data->dev, buf, size);
269                 if (actual != size) {
270                 short_read:
271                         if (actual < 0) {
272                                 retval = errno;
273                                 actual = 0;
274                         } else
275                                 retval = EXT2_ET_SHORT_READ;
276                         goto error_unlock;
277                 }
278                 goto success_unlock;
279         }
280
281 #ifdef ALIGN_DEBUG
282         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
283                (unsigned long) size);
284 #endif
285
286         /*
287          * The buffer or size which we're trying to read isn't aligned
288          * to the O_DIRECT rules, so we need to do this the hard way...
289          */
290 bounce_read:
291         if (channel->align == 0)
292                 channel->align = 1;
293         if ((channel->block_size > channel->align) &&
294             (channel->block_size % channel->align) == 0)
295                 align_size = channel->block_size;
296         else
297                 align_size = channel->align;
298         aligned_blk = location / align_size;
299         offset = location % align_size;
300
301         mutex_lock(data, BOUNCE_MTX);
302         if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
303                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
304                 goto error_unlock;
305         }
306         while (size > 0) {
307                 actual = read(data->dev, data->bounce, align_size);
308                 if (actual != align_size) {
309                         mutex_unlock(data, BOUNCE_MTX);
310                         actual = really_read;
311                         buf -= really_read;
312                         size += really_read;
313                         goto short_read;
314                 }
315                 if ((actual + offset) > align_size)
316                         actual = align_size - offset;
317                 if (actual > size)
318                         actual = size;
319                 memcpy(buf, (char *)data->bounce + offset, actual);
320
321                 really_read += actual;
322                 size -= actual;
323                 buf += actual;
324                 offset = 0;
325                 aligned_blk++;
326         }
327 success_unlock:
328         mutex_unlock(data, BOUNCE_MTX);
329         return 0;
330
331 error_unlock:
332         mutex_unlock(data, BOUNCE_MTX);
333         if (actual >= 0 && actual < size)
334                 memset((char *) buf+actual, 0, size-actual);
335         if (channel->read_error)
336                 retval = (channel->read_error)(channel, block, count, buf,
337                                                size, actual, retval);
338         return retval;
339 }
340
/*
 * Flag for raw_write_blk(): when set, a failed write is reported to the
 * caller without invoking the channel's write_error handler.
 */
#define RAW_WRITE_NO_HANDLER    1
342
/*
 * Write directly to the device, bypassing the block cache.
 *
 * channel - I/O channel; supplies block_size, align and write_error
 * data    - the channel's private data (fd, offset, bounce buffer, locks)
 * block   - first block to write
 * count   - number of blocks if positive; if negative, -count is the
 *           number of *bytes* to write
 * bufv    - source buffer
 * flags   - RAW_WRITE_NO_HANDLER suppresses the write_error callback
 *
 * Returns 0 on success.  On failure returns errno,
 * EXT2_ET_LLSEEK_FAILED or EXT2_ET_SHORT_WRITE, or -- when a
 * write_error handler is installed and not suppressed -- whatever that
 * handler returns.
 */
static errcode_t raw_write_blk(io_channel channel,
                               struct unix_private_data *data,
                               unsigned long long block,
                               int count, const void *bufv,
                               int flags)
{
        ssize_t         size;
        ext2_loff_t     location;
        int             actual = 0;
        errcode_t       retval;
        const unsigned char *buf = bufv;
        unsigned long long aligned_blk;
        int             align_size, offset;

        if (count == 1)
                size = channel->block_size;
        else {
                if (count < 0)
                        size = -count;
                else
                        size = (ext2_loff_t) count * channel->block_size;
        }
        mutex_lock(data, STATS_MTX);
        data->io_stats.bytes_written += size;
        mutex_unlock(data, STATS_MTX);

        location = ((ext2_loff_t) block * channel->block_size) + data->offset;

        if (data->flags & IO_FLAG_FORCE_BOUNCE)
                goto bounce_write;

#ifdef HAVE_PWRITE64
        /* Try an aligned pwrite */
        if ((channel->align == 0) ||
            (IS_ALIGNED(buf, channel->align) &&
             IS_ALIGNED(location, channel->align) &&
             IS_ALIGNED(size, channel->align))) {
                actual = pwrite64(data->dev, buf, size, location);
                if (actual == size)
                        return 0;
                /* Otherwise fall through and retry with lseek + write. */
        }
#elif HAVE_PWRITE
        /* Try an aligned pwrite */
        if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
            ((channel->align == 0) ||
             (IS_ALIGNED(buf, channel->align) &&
              IS_ALIGNED(location, channel->align) &&
              IS_ALIGNED(size, channel->align)))) {
                actual = pwrite(data->dev, buf, size, location);
                if (actual == size)
                        return 0;
                /* Otherwise fall through and retry with lseek + write. */
        }
#endif /* HAVE_PWRITE */

        if ((channel->align == 0) ||
            (IS_ALIGNED(buf, channel->align) &&
             IS_ALIGNED(location, channel->align) &&
             IS_ALIGNED(size, channel->align))) {
                /* BOUNCE_MTX keeps the seek and the write together. */
                mutex_lock(data, BOUNCE_MTX);
                if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
                        retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                        goto error_unlock;
                }
                actual = write(data->dev, buf, size);
                mutex_unlock(data, BOUNCE_MTX);
                if (actual < 0) {
                        retval = errno;
                        goto error_out;
                }
                if (actual != size) {
                short_write:
                        /* Also the target of the bounce loop below; the
                         * mutex has already been released on both paths. */
                        retval = EXT2_ET_SHORT_WRITE;
                        goto error_out;
                }
                return 0;
        }

#ifdef ALIGN_DEBUG
        printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
               (unsigned long) size);
#endif
        /*
         * The buffer or size which we're trying to write isn't aligned
         * to the O_DIRECT rules, so we need to do this the hard way...
         */
bounce_write:
        if (channel->align == 0)
                channel->align = 1;
        if ((channel->block_size > channel->align) &&
            (channel->block_size % channel->align) == 0)
                align_size = channel->block_size;
        else
                align_size = channel->align;
        aligned_blk = location / align_size;
        offset = location % align_size;

        /* One align_size chunk per iteration, staged through data->bounce. */
        while (size > 0) {
                int actual_w;

                mutex_lock(data, BOUNCE_MTX);
                /*
                 * Partial chunk (leading offset or short tail): read the
                 * existing contents first so the bytes we are not
                 * overwriting are preserved (read-modify-write).
                 */
                if (size < align_size || offset) {
                        if (ext2fs_llseek(data->dev, aligned_blk * align_size,
                                          SEEK_SET) < 0) {
                                retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                                goto error_unlock;
                        }
                        actual = read(data->dev, data->bounce,
                                      align_size);
                        if (actual != align_size) {
                                if (actual < 0) {
                                        retval = errno;
                                        goto error_unlock;
                                }
                                /* Short read: treat the missing tail as zeroes. */
                                memset((char *) data->bounce + actual, 0,
                                       align_size - actual);
                        }
                }
                /* Number of caller bytes that fit into this chunk. */
                actual = size;
                if ((actual + offset) > align_size)
                        actual = align_size - offset;
                if (actual > size)
                        actual = size;
                memcpy(((char *)data->bounce) + offset, buf, actual);
                if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
                        retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                        goto error_unlock;
                }
                /* Always write the whole chunk, not just `actual` bytes. */
                actual_w = write(data->dev, data->bounce, align_size);
                mutex_unlock(data, BOUNCE_MTX);
                if (actual_w < 0) {
                        retval = errno;
                        goto error_out;
                }
                if (actual_w != align_size)
                        goto short_write;
                size -= actual;
                buf += actual;
                location += actual;
                aligned_blk++;
                offset = 0;
        }
        return 0;

error_unlock:
        mutex_unlock(data, BOUNCE_MTX);
error_out:
        if (((flags & RAW_WRITE_NO_HANDLER) == 0) && channel->write_error)
                retval = (channel->write_error)(channel, block, count, buf,
                                                size, actual, retval);
        return retval;
}
494
495
496 /*
497  * Here we implement the cache functions
498  */
499
500 /* Allocate the cache buffers */
501 static errcode_t alloc_cache(io_channel channel,
502                              struct unix_private_data *data)
503 {
504         errcode_t               retval;
505         struct unix_cache       *cache;
506         int                     i;
507
508         data->access_time = 0;
509         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
510                 cache->block = 0;
511                 cache->access_time = 0;
512                 cache->dirty = 0;
513                 cache->in_use = 0;
514                 if (cache->buf)
515                         ext2fs_free_mem(&cache->buf);
516                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
517                 if (retval)
518                         return retval;
519         }
520         if (channel->align || data->flags & IO_FLAG_FORCE_BOUNCE) {
521                 if (data->bounce)
522                         ext2fs_free_mem(&data->bounce);
523                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
524         }
525         return retval;
526 }
527
528 /* Free the cache buffers */
529 static void free_cache(struct unix_private_data *data)
530 {
531         struct unix_cache       *cache;
532         int                     i;
533
534         data->access_time = 0;
535         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
536                 cache->block = 0;
537                 cache->access_time = 0;
538                 cache->dirty = 0;
539                 cache->in_use = 0;
540                 if (cache->buf)
541                         ext2fs_free_mem(&cache->buf);
542         }
543         if (data->bounce)
544                 ext2fs_free_mem(&data->bounce);
545 }
546
547 #ifndef NO_IO_CACHE
548 /*
549  * Try to find a block in the cache.  If the block is not found, and
550  * eldest is a non-zero pointer, then fill in eldest with the cache
551  * entry to that should be reused.
552  */
553 static struct unix_cache *find_cached_block(struct unix_private_data *data,
554                                             unsigned long long block,
555                                             struct unix_cache **eldest)
556 {
557         struct unix_cache       *cache, *unused_cache, *oldest_cache;
558         int                     i;
559
560         unused_cache = oldest_cache = 0;
561         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
562                 if (!cache->in_use) {
563                         if (!unused_cache)
564                                 unused_cache = cache;
565                         continue;
566                 }
567                 if (cache->block == block) {
568                         cache->access_time = ++data->access_time;
569                         return cache;
570                 }
571                 if (!oldest_cache ||
572                     (cache->access_time < oldest_cache->access_time))
573                         oldest_cache = cache;
574         }
575         if (eldest)
576                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
577         return 0;
578 }
579
580 /*
581  * Reuse a particular cache entry for another block.
582  */
583 static errcode_t reuse_cache(io_channel channel,
584                 struct unix_private_data *data, struct unix_cache *cache,
585                 unsigned long long block)
586 {
587         if (cache->dirty && cache->in_use) {
588                 errcode_t retval;
589
590                 retval = raw_write_blk(channel, data, cache->block, 1,
591                                        cache->buf, RAW_WRITE_NO_HANDLER);
592                 if (retval) {
593                         cache->write_err = 1;
594                         return retval;
595                 }
596         }
597
598         cache->in_use = 1;
599         cache->dirty = 0;
600         cache->write_err = 0;
601         cache->block = block;
602         cache->access_time = ++data->access_time;
603         return 0;
604 }
605
/* Flags for flush_cached_blocks() */
#define FLUSH_INVALIDATE        0x01    /* mark entries unused once written out */
#define FLUSH_NOLOCK            0x02    /* do not take CACHE_MTX in the flush itself */
608
609 /*
610  * Flush all of the blocks in the cache
611  */
612 static errcode_t flush_cached_blocks(io_channel channel,
613                                      struct unix_private_data *data,
614                                      int flags)
615 {
616         struct unix_cache       *cache;
617         errcode_t               retval, retval2 = 0;
618         int                     i;
619         int                     errors_found = 0;
620
621         if ((flags & FLUSH_NOLOCK) == 0)
622                 mutex_lock(data, CACHE_MTX);
623         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
624                 if (!cache->in_use || !cache->dirty)
625                         continue;
626                 retval = raw_write_blk(channel, data,
627                                        cache->block, 1, cache->buf,
628                                        RAW_WRITE_NO_HANDLER);
629                 if (retval) {
630                         cache->write_err = 1;
631                         errors_found = 1;
632                         retval2 = retval;
633                 } else {
634                         cache->dirty = 0;
635                         cache->write_err = 0;
636                         if (flags & FLUSH_INVALIDATE)
637                                 cache->in_use = 0;
638                 }
639         }
640         if ((flags & FLUSH_NOLOCK) == 0)
641                 mutex_unlock(data, CACHE_MTX);
642 retry:
643         while (errors_found) {
644                 if ((flags & FLUSH_NOLOCK) == 0)
645                         mutex_lock(data, CACHE_MTX);
646                 errors_found = 0;
647                 for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
648                         if (!cache->in_use || !cache->write_err)
649                                 continue;
650                         errors_found = 1;
651                         if (cache->write_err && channel->write_error) {
652                                 char *err_buf = NULL;
653                                 unsigned long long err_block = cache->block;
654
655                                 cache->dirty = 0;
656                                 cache->in_use = 0;
657                                 cache->write_err = 0;
658                                 if (io_channel_alloc_buf(channel, 0,
659                                                          &err_buf))
660                                         err_buf = NULL;
661                                 else
662                                         memcpy(err_buf, cache->buf,
663                                                channel->block_size);
664                                 mutex_unlock(data, CACHE_MTX);
665                                 (channel->write_error)(channel, err_block,
666                                         1, err_buf, channel->block_size, -1,
667                                         retval2);
668                                 if (err_buf)
669                                         ext2fs_free_mem(&err_buf);
670                                 goto retry;
671                         } else
672                                 cache->write_err = 0;
673                 }
674                 if ((flags & FLUSH_NOLOCK) == 0)
675                         mutex_unlock(data, CACHE_MTX);
676         }
677         return retval2;
678 }
679 #endif /* NO_IO_CACHE */
680
681 #ifdef __linux__
682 #ifndef BLKDISCARDZEROES
683 #define BLKDISCARDZEROES _IO(0x12,124)
684 #endif
685 #endif
686
/*
 * Open `pathname`, using open64() where the build provides it and
 * open() otherwise.  The `mode` argument is only passed on to the
 * underlying call when it is non-zero.  Returns whatever the
 * underlying open call returns.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
        if (!mode)
                return open64(pathname, flags);
        return open64(pathname, flags, mode);
#else
        if (!mode)
                return open(pathname, flags);
        return open(pathname, flags, mode);
#endif
}
700
701 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
702 {
703 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
704         return stat64(path, buf);
705 #else
706         return stat(path, buf);
707 #endif
708 }
709
710 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
711 {
712 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
713         return fstat64(fd, buf);
714 #else
715         return fstat(fd, buf);
716 #endif
717 }
718
719
720 static errcode_t unix_open_channel(const char *name, int fd,
721                                    int flags, io_channel *channel,
722                                    io_manager io_mgr)
723 {
724         io_channel      io = NULL;
725         struct unix_private_data *data = NULL;
726         errcode_t       retval;
727         ext2fs_struct_stat st;
728 #ifdef __linux__
729         struct          utsname ut;
730 #endif
731
732         if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
733                 flags |= IO_FLAG_FORCE_BOUNCE;
734
735 #ifdef __linux__
736         /*
737          * We need to make sure any previous errors in the block
738          * device are thrown away, sigh.
739          */
740         (void) fsync(fd);
741 #endif
742
743         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
744         if (retval)
745                 goto cleanup;
746         memset(io, 0, sizeof(struct struct_io_channel));
747         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
748         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
749         if (retval)
750                 goto cleanup;
751
752         io->manager = io_mgr;
753         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
754         if (retval)
755                 goto cleanup;
756
757         strcpy(io->name, name);
758         io->private_data = data;
759         io->block_size = 1024;
760         io->read_error = 0;
761         io->write_error = 0;
762         io->refcount = 1;
763         io->flags = 0;
764
765         memset(data, 0, sizeof(struct unix_private_data));
766         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
767         data->io_stats.num_fields = 2;
768         data->flags = flags;
769         data->dev = fd;
770
771 #if defined(O_DIRECT)
772         if (flags & IO_FLAG_DIRECT_IO)
773                 io->align = ext2fs_get_dio_alignment(data->dev);
774 #elif defined(F_NOCACHE)
775         if (flags & IO_FLAG_DIRECT_IO)
776                 io->align = 4096;
777 #endif
778
779         /*
780          * If the device is really a block device, then set the
781          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
782          * because we are going to use punch hole instead of discard
783          * and if it succeed, subsequent read from sparse area returns
784          * zero.
785          */
786         if (ext2fs_fstat(data->dev, &st) == 0) {
787                 if (ext2fsP_is_disk_device(st.st_mode))
788                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
789                 else
790                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
791         }
792
793 #ifdef BLKDISCARDZEROES
794         {
795                 int zeroes = 0;
796                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
797                     zeroes)
798                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
799         }
800 #endif
801
802 #if defined(__CYGWIN__)
803         /*
804          * Some operating systems require that the buffers be aligned,
805          * regardless of O_DIRECT
806          */
807         if (!io->align)
808                 io->align = 512;
809 #endif
810
811 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
812         if (io->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
813                 int dio_align = ext2fs_get_dio_alignment(fd);
814
815                 if (io->align < dio_align)
816                         io->align = dio_align;
817         }
818 #endif
819
820         if ((retval = alloc_cache(io, data)))
821                 goto cleanup;
822
823 #ifdef BLKROGET
824         if (flags & IO_FLAG_RW) {
825                 int error;
826                 int readonly = 0;
827
828                 /* Is the block device actually writable? */
829                 error = ioctl(data->dev, BLKROGET, &readonly);
830                 if (!error && readonly) {
831                         retval = EPERM;
832                         goto cleanup;
833                 }
834         }
835 #endif
836
837 #ifdef __linux__
838 #undef RLIM_INFINITY
839 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
840 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
841 #else
842 #define RLIM_INFINITY  (~0UL)
843 #endif
844         /*
845          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
846          * block devices are wrongly getting hit by the filesize
847          * limit.  This workaround isn't perfect, since it won't work
848          * if glibc wasn't built against 2.2 header files.  (Sigh.)
849          *
850          */
851         if ((flags & IO_FLAG_RW) &&
852             (uname(&ut) == 0) &&
853             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
854              (ut.release[2] == '4') && (ut.release[3] == '.') &&
855              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
856              (ut.release[5] < '8')) &&
857             (ext2fs_fstat(data->dev, &st) == 0) &&
858             (ext2fsP_is_disk_device(st.st_mode))) {
859                 struct rlimit   rlim;
860
861                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
862                 setrlimit(RLIMIT_FSIZE, &rlim);
863                 getrlimit(RLIMIT_FSIZE, &rlim);
864                 if (((unsigned long) rlim.rlim_cur) <
865                     ((unsigned long) rlim.rlim_max)) {
866                         rlim.rlim_cur = rlim.rlim_max;
867                         setrlimit(RLIMIT_FSIZE, &rlim);
868                 }
869         }
870 #endif
871 #ifdef HAVE_PTHREAD
872         if (flags & IO_FLAG_THREADS) {
873                 io->flags |= CHANNEL_FLAGS_THREADS;
874                 retval = pthread_mutex_init(&data->cache_mutex, NULL);
875                 if (retval)
876                         goto cleanup;
877                 retval = pthread_mutex_init(&data->bounce_mutex, NULL);
878                 if (retval) {
879                         pthread_mutex_destroy(&data->cache_mutex);
880                         goto cleanup;
881                 }
882                 retval = pthread_mutex_init(&data->stats_mutex, NULL);
883                 if (retval) {
884                         pthread_mutex_destroy(&data->cache_mutex);
885                         pthread_mutex_destroy(&data->bounce_mutex);
886                         goto cleanup;
887                 }
888         }
889 #endif
890         *channel = io;
891         return 0;
892
893 cleanup:
894         if (data) {
895                 if (data->dev >= 0)
896                         close(data->dev);
897                 free_cache(data);
898                 ext2fs_free_mem(&data);
899         }
900         if (io) {
901                 if (io->name) {
902                         ext2fs_free_mem(&io->name);
903                 }
904                 ext2fs_free_mem(&io);
905         }
906         return retval;
907 }
908
909 static errcode_t unixfd_open(const char *str_fd, int flags,
910                              io_channel *channel)
911 {
912         int fd;
913         int fd_flags;
914
915         fd = atoi(str_fd);
916 #if defined(HAVE_FCNTL)
917         fd_flags = fcntl(fd, F_GETFD);
918         if (fd_flags == -1)
919                 return EBADF;
920
921         flags = 0;
922         if (fd_flags & O_RDWR)
923                 flags |= IO_FLAG_RW;
924         if (fd_flags & O_EXCL)
925                 flags |= IO_FLAG_EXCLUSIVE;
926 #if defined(O_DIRECT)
927         if (fd_flags & O_DIRECT)
928                 flags |= IO_FLAG_DIRECT_IO;
929 #endif
930 #endif  /* HAVE_FCNTL */
931
932         return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
933 }
934
935 static errcode_t unix_open(const char *name, int flags,
936                            io_channel *channel)
937 {
938         int fd = -1;
939         int open_flags;
940
941         if (name == 0)
942                 return EXT2_ET_BAD_DEVICE_NAME;
943
944         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
945         if (flags & IO_FLAG_EXCLUSIVE)
946                 open_flags |= O_EXCL;
947 #if defined(O_DIRECT)
948         if (flags & IO_FLAG_DIRECT_IO)
949                 open_flags |= O_DIRECT;
950 #endif
951         fd = ext2fs_open_file(name, open_flags, 0);
952         if (fd < 0)
953                 return errno;
954 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
955         if (flags & IO_FLAG_DIRECT_IO) {
956                 if (fcntl(fd, F_NOCACHE, 1) < 0)
957                         return errno;
958         }
959 #endif
960         return unix_open_channel(name, fd, flags, channel, unix_io_manager);
961 }
962
963 static errcode_t unix_close(io_channel channel)
964 {
965         struct unix_private_data *data;
966         errcode_t       retval = 0;
967
968         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
969         data = (struct unix_private_data *) channel->private_data;
970         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
971
972         if (--channel->refcount > 0)
973                 return 0;
974
975 #ifndef NO_IO_CACHE
976         retval = flush_cached_blocks(channel, data, 0);
977 #endif
978
979         if (close(data->dev) < 0)
980                 retval = errno;
981         free_cache(data);
982 #ifdef HAVE_PTHREAD
983         if (data->flags & IO_FLAG_THREADS) {
984                 pthread_mutex_destroy(&data->cache_mutex);
985                 pthread_mutex_destroy(&data->bounce_mutex);
986                 pthread_mutex_destroy(&data->stats_mutex);
987         }
988 #endif
989
990         ext2fs_free_mem(&channel->private_data);
991         if (channel->name)
992                 ext2fs_free_mem(&channel->name);
993         ext2fs_free_mem(&channel);
994         return retval;
995 }
996
997 static errcode_t unix_set_blksize(io_channel channel, int blksize)
998 {
999         struct unix_private_data *data;
1000         errcode_t               retval = 0;
1001
1002         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1003         data = (struct unix_private_data *) channel->private_data;
1004         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1005
1006         if (channel->block_size != blksize) {
1007                 mutex_lock(data, CACHE_MTX);
1008                 mutex_lock(data, BOUNCE_MTX);
1009 #ifndef NO_IO_CACHE
1010                 if ((retval = flush_cached_blocks(channel, data, FLUSH_NOLOCK))){
1011                         mutex_unlock(data, BOUNCE_MTX);
1012                         mutex_unlock(data, CACHE_MTX);
1013                         return retval;
1014                 }
1015 #endif
1016
1017                 channel->block_size = blksize;
1018                 free_cache(data);
1019                 retval = alloc_cache(channel, data);
1020                 mutex_unlock(data, BOUNCE_MTX);
1021                 mutex_unlock(data, CACHE_MTX);
1022         }
1023         return retval;
1024 }
1025
1026 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
1027                                int count, void *buf)
1028 {
1029         struct unix_private_data *data;
1030         struct unix_cache *cache;
1031         errcode_t       retval;
1032         char            *cp;
1033         int             i, j;
1034
1035         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1036         data = (struct unix_private_data *) channel->private_data;
1037         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1038
1039 #ifdef NO_IO_CACHE
1040         return raw_read_blk(channel, data, block, count, buf);
1041 #else
1042         if (data->flags & IO_FLAG_NOCACHE)
1043                 return raw_read_blk(channel, data, block, count, buf);
1044         /*
1045          * If we're doing an odd-sized read or a very large read,
1046          * flush out the cache and then do a direct read.
1047          */
1048         if (count < 0 || count > WRITE_DIRECT_SIZE) {
1049                 if ((retval = flush_cached_blocks(channel, data, 0)))
1050                         return retval;
1051                 return raw_read_blk(channel, data, block, count, buf);
1052         }
1053
1054         cp = buf;
1055         mutex_lock(data, CACHE_MTX);
1056         while (count > 0) {
1057                 /* If it's in the cache, use it! */
1058                 if ((cache = find_cached_block(data, block, NULL))) {
1059 #ifdef DEBUG
1060                         printf("Using cached block %lu\n", block);
1061 #endif
1062                         memcpy(cp, cache->buf, channel->block_size);
1063                         count--;
1064                         block++;
1065                         cp += channel->block_size;
1066                         continue;
1067                 }
1068
1069                 /*
1070                  * Find the number of uncached blocks so we can do a
1071                  * single read request
1072                  */
1073                 for (i=1; i < count; i++)
1074                         if (find_cached_block(data, block+i, NULL))
1075                                 break;
1076 #ifdef DEBUG
1077                 printf("Reading %d blocks starting at %lu\n", i, block);
1078 #endif
1079                 mutex_unlock(data, CACHE_MTX);
1080                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
1081                         return retval;
1082                 mutex_lock(data, CACHE_MTX);
1083
1084                 /* Save the results in the cache */
1085                 for (j=0; j < i; j++) {
1086                         if (!find_cached_block(data, block, &cache)) {
1087                                 retval = reuse_cache(channel, data,
1088                                                      cache, block);
1089                                 if (retval)
1090                                         goto call_write_handler;
1091                                 memcpy(cache->buf, cp, channel->block_size);
1092                         }
1093                         count--;
1094                         block++;
1095                         cp += channel->block_size;
1096                 }
1097         }
1098         mutex_unlock(data, CACHE_MTX);
1099         return 0;
1100
1101 call_write_handler:
1102         if (cache->write_err && channel->write_error) {
1103                 char *err_buf = NULL;
1104                 unsigned long long err_block = cache->block;
1105
1106                 cache->dirty = 0;
1107                 cache->in_use = 0;
1108                 cache->write_err = 0;
1109                 if (io_channel_alloc_buf(channel, 0, &err_buf))
1110                         err_buf = NULL;
1111                 else
1112                         memcpy(err_buf, cache->buf, channel->block_size);
1113                 mutex_unlock(data, CACHE_MTX);
1114                 (channel->write_error)(channel, err_block, 1, err_buf,
1115                                        channel->block_size, -1,
1116                                        retval);
1117                 if (err_buf)
1118                         ext2fs_free_mem(&err_buf);
1119         } else
1120                 mutex_unlock(data, CACHE_MTX);
1121         return retval;
1122 #endif /* NO_IO_CACHE */
1123 }
1124
1125 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
1126                                int count, void *buf)
1127 {
1128         return unix_read_blk64(channel, block, count, buf);
1129 }
1130
1131 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
1132                                 int count, const void *buf)
1133 {
1134         struct unix_private_data *data;
1135         struct unix_cache *cache, *reuse;
1136         errcode_t       retval = 0;
1137         const char      *cp;
1138         int             writethrough;
1139
1140         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1141         data = (struct unix_private_data *) channel->private_data;
1142         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1143
1144 #ifdef NO_IO_CACHE
1145         return raw_write_blk(channel, data, block, count, buf, 0);
1146 #else
1147         if (data->flags & IO_FLAG_NOCACHE)
1148                 return raw_write_blk(channel, data, block, count, buf, 0);
1149         /*
1150          * If we're doing an odd-sized write or a very large write,
1151          * flush out the cache completely and then do a direct write.
1152          */
1153         if (count < 0 || count > WRITE_DIRECT_SIZE) {
1154                 if ((retval = flush_cached_blocks(channel, data,
1155                                                   FLUSH_INVALIDATE)))
1156                         return retval;
1157                 return raw_write_blk(channel, data, block, count, buf, 0);
1158         }
1159
1160         /*
1161          * For a moderate-sized multi-block write, first force a write
1162          * if we're in write-through cache mode, and then fill the
1163          * cache with the blocks.
1164          */
1165         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
1166         if (writethrough)
1167                 retval = raw_write_blk(channel, data, block, count, buf, 0);
1168
1169         cp = buf;
1170         mutex_lock(data, CACHE_MTX);
1171         while (count > 0) {
1172                 cache = find_cached_block(data, block, &reuse);
1173                 if (!cache) {
1174                         errcode_t err;
1175
1176                         cache = reuse;
1177                         err = reuse_cache(channel, data, cache, block);
1178                         if (err)
1179                                 goto call_write_handler;
1180                 }
1181                 if (cache->buf != cp)
1182                         memcpy(cache->buf, cp, channel->block_size);
1183                 cache->dirty = !writethrough;
1184                 count--;
1185                 block++;
1186                 cp += channel->block_size;
1187         }
1188         mutex_unlock(data, CACHE_MTX);
1189         return retval;
1190
1191 call_write_handler:
1192         if (cache->write_err && channel->write_error) {
1193                 char *err_buf = NULL;
1194                 unsigned long long err_block = cache->block;
1195
1196                 cache->dirty = 0;
1197                 cache->in_use = 0;
1198                 cache->write_err = 0;
1199                 if (io_channel_alloc_buf(channel, 0, &err_buf))
1200                         err_buf = NULL;
1201                 else
1202                         memcpy(err_buf, cache->buf, channel->block_size);
1203                 mutex_unlock(data, CACHE_MTX);
1204                 (channel->write_error)(channel, err_block, 1, err_buf,
1205                                        channel->block_size, -1,
1206                                        retval);
1207                 if (err_buf)
1208                         ext2fs_free_mem(&err_buf);
1209         } else
1210                 mutex_unlock(data, CACHE_MTX);
1211         return retval;
1212 #endif /* NO_IO_CACHE */
1213 }
1214
1215 static errcode_t unix_cache_readahead(io_channel channel,
1216                                       unsigned long long block,
1217                                       unsigned long long count)
1218 {
1219 #ifdef POSIX_FADV_WILLNEED
1220         struct unix_private_data *data;
1221
1222         data = (struct unix_private_data *)channel->private_data;
1223         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1224         return posix_fadvise(data->dev,
1225                              (ext2_loff_t)block * channel->block_size + data->offset,
1226                              (ext2_loff_t)count * channel->block_size,
1227                              POSIX_FADV_WILLNEED);
1228 #else
1229         return EXT2_ET_OP_NOT_SUPPORTED;
1230 #endif
1231 }
1232
1233 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
1234                                 int count, const void *buf)
1235 {
1236         return unix_write_blk64(channel, block, count, buf);
1237 }
1238
1239 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
1240                                  int size, const void *buf)
1241 {
1242         struct unix_private_data *data;
1243         errcode_t       retval = 0;
1244         ssize_t         actual;
1245
1246         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1247         data = (struct unix_private_data *) channel->private_data;
1248         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1249
1250         if (channel->align != 0) {
1251 #ifdef ALIGN_DEBUG
1252                 printf("unix_write_byte: O_DIRECT fallback\n");
1253 #endif
1254                 return EXT2_ET_UNIMPLEMENTED;
1255         }
1256
1257 #ifndef NO_IO_CACHE
1258         /*
1259          * Flush out the cache completely
1260          */
1261         if ((retval = flush_cached_blocks(channel, data, FLUSH_INVALIDATE)))
1262                 return retval;
1263 #endif
1264
1265         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
1266                 return errno;
1267
1268         actual = write(data->dev, buf, size);
1269         if (actual < 0)
1270                 return errno;
1271         if (actual != size)
1272                 return EXT2_ET_SHORT_WRITE;
1273
1274         return 0;
1275 }
1276
1277 /*
1278  * Flush data buffers to disk.
1279  */
1280 static errcode_t unix_flush(io_channel channel)
1281 {
1282         struct unix_private_data *data;
1283         errcode_t retval = 0;
1284
1285         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1286         data = (struct unix_private_data *) channel->private_data;
1287         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1288
1289 #ifndef NO_IO_CACHE
1290         retval = flush_cached_blocks(channel, data, 0);
1291 #endif
1292 #ifdef HAVE_FSYNC
1293         if (!retval && fsync(data->dev) != 0)
1294                 return errno;
1295 #endif
1296         return retval;
1297 }
1298
1299 static errcode_t unix_set_option(io_channel channel, const char *option,
1300                                  const char *arg)
1301 {
1302         struct unix_private_data *data;
1303         unsigned long long tmp;
1304         errcode_t retval;
1305         char *end;
1306
1307         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1308         data = (struct unix_private_data *) channel->private_data;
1309         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1310
1311         if (!strcmp(option, "offset")) {
1312                 if (!arg)
1313                         return EXT2_ET_INVALID_ARGUMENT;
1314
1315                 tmp = strtoull(arg, &end, 0);
1316                 if (*end)
1317                         return EXT2_ET_INVALID_ARGUMENT;
1318                 data->offset = tmp;
1319                 if (data->offset < 0)
1320                         return EXT2_ET_INVALID_ARGUMENT;
1321                 return 0;
1322         }
1323         if (!strcmp(option, "cache")) {
1324                 if (!arg)
1325                         return EXT2_ET_INVALID_ARGUMENT;
1326                 if (!strcmp(arg, "on")) {
1327                         data->flags &= ~IO_FLAG_NOCACHE;
1328                         return 0;
1329                 }
1330                 if (!strcmp(arg, "off")) {
1331                         retval = flush_cached_blocks(channel, data, 0);
1332                         data->flags |= IO_FLAG_NOCACHE;
1333                         return retval;
1334                 }
1335                 return EXT2_ET_INVALID_ARGUMENT;
1336         }
1337         return EXT2_ET_INVALID_ARGUMENT;
1338 }
1339
1340 #if defined(__linux__) && !defined(BLKDISCARD)
1341 #define BLKDISCARD              _IO(0x12,119)
1342 #endif
1343
1344 static errcode_t unix_discard(io_channel channel, unsigned long long block,
1345                               unsigned long long count)
1346 {
1347         struct unix_private_data *data;
1348         int             ret;
1349
1350         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1351         data = (struct unix_private_data *) channel->private_data;
1352         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1353
1354         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1355 #ifdef BLKDISCARD
1356                 __u64 range[2];
1357
1358                 range[0] = (__u64)(block) * channel->block_size + data->offset;
1359                 range[1] = (__u64)(count) * channel->block_size;
1360
1361                 ret = ioctl(data->dev, BLKDISCARD, &range);
1362 #else
1363                 goto unimplemented;
1364 #endif
1365         } else {
1366 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
1367                 /*
1368                  * If we are not on block device, try to use punch hole
1369                  * to reclaim free space.
1370                  */
1371                 ret = fallocate(data->dev,
1372                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1373                                 (off_t)(block) * channel->block_size + data->offset,
1374                                 (off_t)(count) * channel->block_size);
1375 #else
1376                 goto unimplemented;
1377 #endif
1378         }
1379         if (ret < 0) {
1380                 if (errno == EOPNOTSUPP)
1381                         goto unimplemented;
1382                 return errno;
1383         }
1384         return 0;
1385 unimplemented:
1386         return EXT2_ET_UNIMPLEMENTED;
1387 }
1388
/*
 * Zero the byte range [offset, offset + len) of fd using fallocate().
 *
 * FALLOC_FL_ZERO_RANGE is attempted first when it is known, because it
 * does not unmap preallocated blocks; FALLOC_FL_PUNCH_HOLE (keeping the
 * file size) is the fallback.  fallocate is preferred because it always
 * invalidates the page cache, and libext2fs requires that reads after a
 * zeroout return zeroes.
 *
 * Returns 0 as soon as one method succeeds.  Otherwise errno is forced to
 * EOPNOTSUPP and the result of the last attempt is returned (-1 if no
 * method was compiled in).
 */
static int __unix_zeroout(int fd, off_t offset, off_t len)
{
	int rc = -1;

#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_ZERO_RANGE)
	rc = fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
	if (!rc)
		return 0;
#endif
#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
	rc = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		       offset, len);
	if (!rc)
		return 0;
#endif
	/* Every available method failed (or none exists). */
	errno = EOPNOTSUPP;
	return rc;
}
1413
1414 /* parameters might not be used if OS doesn't support zeroout */
1415 #if __GNUC_PREREQ (4, 6)
1416 #pragma GCC diagnostic push
1417 #pragma GCC diagnostic ignored "-Wunused-parameter"
1418 #endif
1419 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
1420                               unsigned long long count)
1421 {
1422         struct unix_private_data *data;
1423         int             ret;
1424
1425         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1426         data = (struct unix_private_data *) channel->private_data;
1427         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1428
1429         if (safe_getenv("UNIX_IO_NOZEROOUT"))
1430                 goto unimplemented;
1431
1432         if (!(channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE)) {
1433                 /* Regular file, try to use truncate/punch/zero. */
1434                 struct stat statbuf;
1435
1436                 if (count == 0)
1437                         return 0;
1438                 /*
1439                  * If we're trying to zero a range past the end of the file,
1440                  * extend the file size, then truncate everything.
1441                  */
1442                 ret = fstat(data->dev, &statbuf);
1443                 if (ret)
1444                         goto err;
1445                 if ((unsigned long long) statbuf.st_size <
1446                         (block + count) * channel->block_size + data->offset) {
1447                         ret = ftruncate(data->dev,
1448                                         (block + count) * channel->block_size + data->offset);
1449                         if (ret)
1450                                 goto err;
1451                 }
1452         }
1453
1454         ret = __unix_zeroout(data->dev,
1455                         (off_t)(block) * channel->block_size + data->offset,
1456                         (off_t)(count) * channel->block_size);
1457 err:
1458         if (ret < 0) {
1459                 if (errno == EOPNOTSUPP)
1460                         goto unimplemented;
1461                 return errno;
1462         }
1463         return 0;
1464 unimplemented:
1465         return EXT2_ET_UNIMPLEMENTED;
1466 }
1467 #if __GNUC_PREREQ (4, 6)
1468 #pragma GCC diagnostic pop
1469 #endif
1470
1471 static struct struct_io_manager struct_unix_manager = {
1472         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1473         .name           = "Unix I/O Manager",
1474         .open           = unix_open,
1475         .close          = unix_close,
1476         .set_blksize    = unix_set_blksize,
1477         .read_blk       = unix_read_blk,
1478         .write_blk      = unix_write_blk,
1479         .flush          = unix_flush,
1480         .write_byte     = unix_write_byte,
1481         .set_option     = unix_set_option,
1482         .get_stats      = unix_get_stats,
1483         .read_blk64     = unix_read_blk64,
1484         .write_blk64    = unix_write_blk64,
1485         .discard        = unix_discard,
1486         .cache_readahead        = unix_cache_readahead,
1487         .zeroout        = unix_zeroout,
1488 };
1489
1490 io_manager unix_io_manager = &struct_unix_manager;
1491
1492 static struct struct_io_manager struct_unixfd_manager = {
1493         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1494         .name           = "Unix fd I/O Manager",
1495         .open           = unixfd_open,
1496         .close          = unix_close,
1497         .set_blksize    = unix_set_blksize,
1498         .read_blk       = unix_read_blk,
1499         .write_blk      = unix_write_blk,
1500         .flush          = unix_flush,
1501         .write_byte     = unix_write_byte,
1502         .set_option     = unix_set_option,
1503         .get_stats      = unix_get_stats,
1504         .read_blk64     = unix_read_blk64,
1505         .write_blk64    = unix_write_blk64,
1506         .discard        = unix_discard,
1507         .cache_readahead        = unix_cache_readahead,
1508         .zeroout        = unix_zeroout,
1509 };
1510
1511 io_manager unixfd_io_manager = &struct_unixfd_manager;