/*
 * unix_io.c --- This is the Unix (well, really POSIX) implementation
 *      of the I/O manager.
 *
 * Implements a small cache of recently used blocks.
 *
 * Includes support for Windows NT under Cygwin.
 *
 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
 *      2002 by Theodore Ts'o.
 *
 * %Begin-Header%
 * This file may be redistributed under the terms of the GNU Library
 * General Public License, version 2.
 * %End-Header%
 */

#if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#define _XOPEN_SOURCE 600
#define _DARWIN_C_SOURCE
#define _FILE_OFFSET_BITS 64
#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
#endif
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#endif

#include "config.h"
#include <stdio.h>
#include <string.h>
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#if HAVE_ERRNO_H
#include <errno.h>
#endif
#include <fcntl.h>
#include <time.h>
#ifdef __linux__
#include <sys/utsname.h>
#endif
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_IOCTL_H
#include <sys/ioctl.h>
#endif
#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif
#ifdef HAVE_SYS_PRCTL_H
#include <sys/prctl.h>
#else
#define PR_GET_DUMPABLE 3
#endif
#if HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#if HAVE_SYS_RESOURCE_H
#include <sys/resource.h>
#endif
#if HAVE_LINUX_FALLOC_H
#include <linux/falloc.h>
#endif
#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif

#if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
#define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
#endif

#undef ALIGN_DEBUG

#include "ext2_fs.h"
#include "ext2fs.h"
#include "ext2fsP.h"

/*
 * For checking structure magic numbers...
 */

#define EXT2_CHECK_MAGIC(struct, code) \
          if ((struct)->magic != (code)) return (code)

struct unix_cache {
        char                    *buf;
        unsigned long long      block;
        int                     access_time;
        unsigned                dirty:1;
        unsigned                in_use:1;
        unsigned                write_err:1;
};

#define CACHE_SIZE 8
#define WRITE_DIRECT_SIZE 4     /* Must be smaller than CACHE_SIZE */
#define READ_DIRECT_SIZE 4      /* Should be smaller than CACHE_SIZE */

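/*
 * The cache embedded in unix_private_data below is a small write-back
 * cache: find_cached_block() treats access_time as an LRU clock and picks
 * either an unused entry or the least recently used one for reuse,
 * reuse_cache() writes back a dirty victim before recycling it, and
 * flush_cached_blocks() writes out (and optionally invalidates) whatever
 * is still dirty.
 */
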
struct unix_private_data {
        int     magic;
        int     dev;
        int     flags;
        int     align;
        int     access_time;
        ext2_loff_t offset;
        struct unix_cache cache[CACHE_SIZE];
        void    *bounce;
        struct struct_io_stats io_stats;
#ifdef HAVE_PTHREAD
        pthread_mutex_t cache_mutex;
        pthread_mutex_t bounce_mutex;
        pthread_mutex_t stats_mutex;
#endif
};

#define IS_ALIGNED(n, align) ((((uintptr_t) n) & \
                               ((uintptr_t) ((align)-1))) == 0)

typedef enum lock_kind {
        CACHE_MTX, BOUNCE_MTX, STATS_MTX
} kind_t;

#ifdef HAVE_PTHREAD
static inline pthread_mutex_t *get_mutex(struct unix_private_data *data,
                                         kind_t kind)
{
        if (data->flags & IO_FLAG_THREADS) {
                switch (kind) {
                case CACHE_MTX:
                        return &data->cache_mutex;
                case BOUNCE_MTX:
                        return &data->bounce_mutex;
                case STATS_MTX:
                        return &data->stats_mutex;
                }
        }
        return NULL;
}
#endif

static inline void mutex_lock(struct unix_private_data *data, kind_t kind)
{
#ifdef HAVE_PTHREAD
        pthread_mutex_t *mtx = get_mutex(data,kind);

        if (mtx)
                pthread_mutex_lock(mtx);
#endif
}

static inline void mutex_unlock(struct unix_private_data *data, kind_t kind)
{
#ifdef HAVE_PTHREAD
        pthread_mutex_t *mtx = get_mutex(data,kind);

        if (mtx)
                pthread_mutex_unlock(mtx);
#endif
}

static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
{
        errcode_t       retval = 0;

        struct unix_private_data *data;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (stats) {
                mutex_lock(data, STATS_MTX);
                *stats = &data->io_stats;
                mutex_unlock(data, STATS_MTX);
        }

        return retval;
}

static char *safe_getenv(const char *arg)
{
        if ((getuid() != geteuid()) || (getgid() != getegid()))
                return NULL;
#ifdef HAVE_PRCTL
        if (prctl(PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
                return NULL;
#else
#if (defined(linux) && defined(SYS_prctl))
        if (syscall(SYS_prctl, PR_GET_DUMPABLE, 0, 0, 0, 0) == 0)
                return NULL;
#endif
#endif

#if defined(HAVE_SECURE_GETENV)
        return secure_getenv(arg);
#elif defined(HAVE___SECURE_GETENV)
        return __secure_getenv(arg);
#else
        return getenv(arg);
#endif
}

/*
 * Here are the raw I/O functions
 */
static errcode_t raw_read_blk(io_channel channel,
                              struct unix_private_data *data,
                              unsigned long long block,
                              int count, void *bufv)
{
        errcode_t       retval;
        ssize_t         size;
        ext2_loff_t     location;
        int             actual = 0;
        unsigned char   *buf = bufv;
        ssize_t         really_read = 0;
        unsigned long long aligned_blk;
        int             align_size, offset;

        size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
        mutex_lock(data, STATS_MTX);
        data->io_stats.bytes_read += size;
        mutex_unlock(data, STATS_MTX);
        location = ((ext2_loff_t) block * channel->block_size) + data->offset;

        if (data->flags & IO_FLAG_FORCE_BOUNCE)
                goto bounce_read;

#ifdef HAVE_PREAD64
        /* Try an aligned pread */
        if ((channel->align == 0) ||
            (IS_ALIGNED(buf, channel->align) &&
             IS_ALIGNED(location, channel->align) &&
             IS_ALIGNED(size, channel->align))) {
                actual = pread64(data->dev, buf, size, location);
                if (actual == size)
                        return 0;
                actual = 0;
        }
#elif HAVE_PREAD
        /* Try an aligned pread */
        if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
            ((channel->align == 0) ||
             (IS_ALIGNED(buf, channel->align) &&
              IS_ALIGNED(location, channel->align) &&
              IS_ALIGNED(size, channel->align)))) {
                actual = pread(data->dev, buf, size, location);
                if (actual == size)
                        return 0;
                actual = 0;
        }
#endif /* HAVE_PREAD */

        if ((channel->align == 0) ||
            (IS_ALIGNED(buf, channel->align) &&
             IS_ALIGNED(location, channel->align) &&
             IS_ALIGNED(size, channel->align))) {
                mutex_lock(data, BOUNCE_MTX);
                if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
                        retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                        goto error_unlock;
                }
                actual = read(data->dev, buf, size);
                if (actual != size) {
                short_read:
                        if (actual < 0) {
                                retval = errno;
                                actual = 0;
                        } else
                                retval = EXT2_ET_SHORT_READ;
                        goto error_unlock;
                }
                goto success_unlock;
        }

#ifdef ALIGN_DEBUG
        printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
               (unsigned long) size);
#endif

        /*
         * The buffer or size which we're trying to read isn't aligned
         * to the O_DIRECT rules, so we need to do this the hard way...
         */
bounce_read:
        if (channel->align == 0)
                channel->align = 1;
        if ((channel->block_size > channel->align) &&
            (channel->block_size % channel->align) == 0)
                align_size = channel->block_size;
        else
                align_size = channel->align;
        aligned_blk = location / align_size;
        offset = location % align_size;

        mutex_lock(data, BOUNCE_MTX);
        if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
                retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                goto error_unlock;
        }
        while (size > 0) {
                actual = read(data->dev, data->bounce, align_size);
                if (actual != align_size) {
                        actual = really_read;
                        buf -= really_read;
                        size += really_read;
                        goto short_read;
                }
                if ((actual + offset) > align_size)
                        actual = align_size - offset;
                if (actual > size)
                        actual = size;
                memcpy(buf, (char *)data->bounce + offset, actual);

                really_read += actual;
                size -= actual;
                buf += actual;
                offset = 0;
                aligned_blk++;
        }
success_unlock:
        mutex_unlock(data, BOUNCE_MTX);
        return 0;

error_unlock:
        mutex_unlock(data, BOUNCE_MTX);
        if (actual >= 0 && actual < size)
                memset((char *) buf+actual, 0, size-actual);
        if (channel->read_error)
                retval = (channel->read_error)(channel, block, count, buf,
                                               size, actual, retval);
        return retval;
}

#define RAW_WRITE_NO_HANDLER    1

static errcode_t raw_write_blk(io_channel channel,
                               struct unix_private_data *data,
                               unsigned long long block,
                               int count, const void *bufv,
                               int flags)
{
        ssize_t         size;
        ext2_loff_t     location;
        int             actual = 0;
        errcode_t       retval;
        const unsigned char *buf = bufv;
        unsigned long long aligned_blk;
        int             align_size, offset;

        if (count == 1)
                size = channel->block_size;
        else {
                if (count < 0)
                        size = -count;
                else
                        size = (ext2_loff_t) count * channel->block_size;
        }
        mutex_lock(data, STATS_MTX);
        data->io_stats.bytes_written += size;
        mutex_unlock(data, STATS_MTX);

        location = ((ext2_loff_t) block * channel->block_size) + data->offset;

        if (data->flags & IO_FLAG_FORCE_BOUNCE)
                goto bounce_write;

#ifdef HAVE_PWRITE64
        /* Try an aligned pwrite */
        if ((channel->align == 0) ||
            (IS_ALIGNED(buf, channel->align) &&
             IS_ALIGNED(location, channel->align) &&
             IS_ALIGNED(size, channel->align))) {
                actual = pwrite64(data->dev, buf, size, location);
                if (actual == size)
                        return 0;
        }
#elif HAVE_PWRITE
        /* Try an aligned pwrite */
        if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
            ((channel->align == 0) ||
             (IS_ALIGNED(buf, channel->align) &&
              IS_ALIGNED(location, channel->align) &&
              IS_ALIGNED(size, channel->align)))) {
                actual = pwrite(data->dev, buf, size, location);
                if (actual == size)
                        return 0;
        }
#endif /* HAVE_PWRITE */

        if ((channel->align == 0) ||
            (IS_ALIGNED(buf, channel->align) &&
             IS_ALIGNED(location, channel->align) &&
             IS_ALIGNED(size, channel->align))) {
                mutex_lock(data, BOUNCE_MTX);
                if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
                        retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                        goto error_unlock;
                }
                actual = write(data->dev, buf, size);
                mutex_unlock(data, BOUNCE_MTX);
                if (actual < 0) {
                        retval = errno;
                        goto error_out;
                }
                if (actual != size) {
                short_write:
                        retval = EXT2_ET_SHORT_WRITE;
                        goto error_out;
                }
                return 0;
        }

#ifdef ALIGN_DEBUG
        printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
               (unsigned long) size);
#endif
        /*
         * The buffer or size which we're trying to write isn't aligned
         * to the O_DIRECT rules, so we need to do this the hard way...
         */
bounce_write:
        if (channel->align == 0)
                channel->align = 1;
        if ((channel->block_size > channel->align) &&
            (channel->block_size % channel->align) == 0)
                align_size = channel->block_size;
        else
                align_size = channel->align;
        aligned_blk = location / align_size;
        offset = location % align_size;

        while (size > 0) {
                int actual_w;

                mutex_lock(data, BOUNCE_MTX);
                if (size < align_size || offset) {
                        if (ext2fs_llseek(data->dev, aligned_blk * align_size,
                                          SEEK_SET) < 0) {
                                retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                                goto error_unlock;
                        }
                        actual = read(data->dev, data->bounce,
                                      align_size);
                        if (actual != align_size) {
                                if (actual < 0) {
                                        retval = errno;
                                        goto error_unlock;
                                }
                                memset((char *) data->bounce + actual, 0,
                                       align_size - actual);
                        }
                }
                actual = size;
                if ((actual + offset) > align_size)
                        actual = align_size - offset;
                if (actual > size)
                        actual = size;
                memcpy(((char *)data->bounce) + offset, buf, actual);
                if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
                        retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
                        goto error_unlock;
                }
                actual_w = write(data->dev, data->bounce, align_size);
                mutex_unlock(data, BOUNCE_MTX);
                if (actual_w < 0) {
                        retval = errno;
                        goto error_out;
                }
                if (actual_w != align_size)
                        goto short_write;
                size -= actual;
                buf += actual;
                location += actual;
                aligned_blk++;
                offset = 0;
        }
        return 0;

error_unlock:
        mutex_unlock(data, BOUNCE_MTX);
error_out:
        if (((flags & RAW_WRITE_NO_HANDLER) == 0) && channel->write_error)
                retval = (channel->write_error)(channel, block, count, buf,
                                                size, actual, retval);
        return retval;
}


/*
 * Here we implement the cache functions
 */

/* Allocate the cache buffers */
static errcode_t alloc_cache(io_channel channel,
                             struct unix_private_data *data)
{
        errcode_t               retval;
        struct unix_cache       *cache;
        int                     i;

        data->access_time = 0;
        for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
                cache->block = 0;
                cache->access_time = 0;
                cache->dirty = 0;
                cache->in_use = 0;
                if (cache->buf)
                        ext2fs_free_mem(&cache->buf);
                retval = io_channel_alloc_buf(channel, 0, &cache->buf);
                if (retval)
                        return retval;
        }
        if (channel->align || data->flags & IO_FLAG_FORCE_BOUNCE) {
                if (data->bounce)
                        ext2fs_free_mem(&data->bounce);
                retval = io_channel_alloc_buf(channel, 0, &data->bounce);
        }
        return retval;
}

/* Free the cache buffers */
static void free_cache(struct unix_private_data *data)
{
        struct unix_cache       *cache;
        int                     i;

        data->access_time = 0;
        for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
                cache->block = 0;
                cache->access_time = 0;
                cache->dirty = 0;
                cache->in_use = 0;
                if (cache->buf)
                        ext2fs_free_mem(&cache->buf);
        }
        if (data->bounce)
                ext2fs_free_mem(&data->bounce);
}

#ifndef NO_IO_CACHE
/*
 * Try to find a block in the cache.  If the block is not found, and
 * eldest is a non-zero pointer, then fill in eldest with the cache
 * entry that should be reused.
 */
static struct unix_cache *find_cached_block(struct unix_private_data *data,
                                            unsigned long long block,
                                            struct unix_cache **eldest)
{
        struct unix_cache       *cache, *unused_cache, *oldest_cache;
        int                     i;

        unused_cache = oldest_cache = 0;
        for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
                if (!cache->in_use) {
                        if (!unused_cache)
                                unused_cache = cache;
                        continue;
                }
                if (cache->block == block) {
                        cache->access_time = ++data->access_time;
                        return cache;
                }
                if (!oldest_cache ||
                    (cache->access_time < oldest_cache->access_time))
                        oldest_cache = cache;
        }
        if (eldest)
                *eldest = (unused_cache) ? unused_cache : oldest_cache;
        return 0;
}

/*
 * Reuse a particular cache entry for another block.
 */
static errcode_t reuse_cache(io_channel channel,
                struct unix_private_data *data, struct unix_cache *cache,
                unsigned long long block)
{
        if (cache->dirty && cache->in_use) {
                errcode_t retval;

                retval = raw_write_blk(channel, data, cache->block, 1,
                                       cache->buf, RAW_WRITE_NO_HANDLER);
                if (retval) {
                        cache->write_err = 1;
                        return retval;
                }
        }

        cache->in_use = 1;
        cache->dirty = 0;
        cache->write_err = 0;
        cache->block = block;
        cache->access_time = ++data->access_time;
        return 0;
}

#define FLUSH_INVALIDATE        0x01
#define FLUSH_NOLOCK            0x02

/*
 * Flush all of the blocks in the cache
 */
static errcode_t flush_cached_blocks(io_channel channel,
                                     struct unix_private_data *data,
                                     int flags)
{
        struct unix_cache       *cache;
        errcode_t               retval, retval2 = 0;
        int                     i;
        int                     errors_found = 0;

        if ((flags & FLUSH_NOLOCK) == 0)
                mutex_lock(data, CACHE_MTX);
        for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
                if (!cache->in_use || !cache->dirty)
                        continue;
                retval = raw_write_blk(channel, data,
                                       cache->block, 1, cache->buf,
                                       RAW_WRITE_NO_HANDLER);
                if (retval) {
                        cache->write_err = 1;
                        errors_found = 1;
                        retval2 = retval;
                } else {
                        cache->dirty = 0;
                        cache->write_err = 0;
                        if (flags & FLUSH_INVALIDATE)
                                cache->in_use = 0;
                }
        }
        if ((flags & FLUSH_NOLOCK) == 0)
                mutex_unlock(data, CACHE_MTX);
retry:
        while (errors_found) {
                if ((flags & FLUSH_NOLOCK) == 0)
                        mutex_lock(data, CACHE_MTX);
                errors_found = 0;
                for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
                        if (!cache->in_use || !cache->write_err)
                                continue;
                        errors_found = 1;
                        if (cache->write_err && channel->write_error) {
                                char *err_buf = NULL;
                                unsigned long long err_block = cache->block;

                                cache->dirty = 0;
                                cache->in_use = 0;
                                cache->write_err = 0;
                                if (io_channel_alloc_buf(channel, 0,
                                                         &err_buf))
                                        err_buf = NULL;
                                else
                                        memcpy(err_buf, cache->buf,
                                               channel->block_size);
                                mutex_unlock(data, CACHE_MTX);
                                (channel->write_error)(channel, err_block,
                                        1, err_buf, channel->block_size, -1,
                                        retval2);
                                if (err_buf)
                                        ext2fs_free_mem(&err_buf);
                                goto retry;
                        } else
                                cache->write_err = 0;
                }
                if ((flags & FLUSH_NOLOCK) == 0)
                        mutex_unlock(data, CACHE_MTX);
        }
        return retval2;
}
#endif /* NO_IO_CACHE */

#ifdef __linux__
#ifndef BLKDISCARDZEROES
#define BLKDISCARDZEROES _IO(0x12,124)
#endif
#endif

int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
        if (mode)
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
                return open64(pathname, flags, mode);
        else
                return open64(pathname, flags);
#else
                return open(pathname, flags, mode);
        else
                return open(pathname, flags);
#endif
}

int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
{
#if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
        return stat64(path, buf);
#else
        return stat(path, buf);
#endif
}

int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
{
#if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
        return fstat64(fd, buf);
#else
        return fstat(fd, buf);
#endif
}


static errcode_t unix_open_channel(const char *name, int fd,
                                   int flags, io_channel *channel,
                                   io_manager io_mgr)
{
        io_channel      io = NULL;
        struct unix_private_data *data = NULL;
        errcode_t       retval;
        ext2fs_struct_stat st;
#ifdef __linux__
        struct          utsname ut;
#endif

        if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
                flags |= IO_FLAG_FORCE_BOUNCE;

#ifdef __linux__
        /*
         * We need to make sure any previous errors in the block
         * device are thrown away, sigh.
         */
        (void) fsync(fd);
#endif

        retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
        if (retval)
                goto cleanup;
        memset(io, 0, sizeof(struct struct_io_channel));
        io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
        retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
        if (retval)
                goto cleanup;

        io->manager = io_mgr;
        retval = ext2fs_get_mem(strlen(name)+1, &io->name);
        if (retval)
                goto cleanup;

        strcpy(io->name, name);
        io->private_data = data;
        io->block_size = 1024;
        io->read_error = 0;
        io->write_error = 0;
        io->refcount = 1;
        io->flags = 0;

        if (safe_getenv("UNIX_IO_NOZEROOUT"))
                io->flags |= CHANNEL_FLAGS_NOZEROOUT;

        memset(data, 0, sizeof(struct unix_private_data));
        data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
        data->io_stats.num_fields = 2;
        data->flags = flags;
        data->dev = fd;

#if defined(O_DIRECT)
        if (flags & IO_FLAG_DIRECT_IO)
                io->align = ext2fs_get_dio_alignment(data->dev);
#elif defined(F_NOCACHE)
        if (flags & IO_FLAG_DIRECT_IO)
                io->align = 4096;
#endif

        /*
         * If the device is really a block device, then set the
         * appropriate flag; otherwise we can set the DISCARD_ZEROES
         * flag, because we are going to use punch hole instead of
         * discard, and if that succeeds, subsequent reads from the
         * sparse area return zeroes.
         */
        if (ext2fs_fstat(data->dev, &st) == 0) {
                if (ext2fsP_is_disk_device(st.st_mode)) {
#ifdef BLKDISCARDZEROES
                        int zeroes = 0;

                        if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
                            zeroes)
                                io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
#endif
                        io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
                } else {
                        io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
                }
        }

#if defined(__CYGWIN__)
        /*
         * Some operating systems require that the buffers be aligned,
         * regardless of O_DIRECT
         */
        if (!io->align)
                io->align = 512;
#endif

#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
        if (io->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
                int dio_align = ext2fs_get_dio_alignment(fd);

                if (io->align < dio_align)
                        io->align = dio_align;
        }
#endif

        if ((retval = alloc_cache(io, data)))
                goto cleanup;

#ifdef BLKROGET
        if (flags & IO_FLAG_RW) {
                int error;
                int readonly = 0;

                /* Is the block device actually writable? */
                error = ioctl(data->dev, BLKROGET, &readonly);
                if (!error && readonly) {
                        retval = EPERM;
                        goto cleanup;
                }
        }
#endif

#ifdef __linux__
#undef RLIM_INFINITY
#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
#define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
#else
#define RLIM_INFINITY  (~0UL)
#endif
        /*
         * Work around a bug in 2.4.10-2.4.18 kernels where writes to
         * block devices are wrongly getting hit by the filesize
         * limit.  This workaround isn't perfect, since it won't work
         * if glibc wasn't built against 2.2 header files.  (Sigh.)
         *
         */
        if ((flags & IO_FLAG_RW) &&
            (uname(&ut) == 0) &&
            ((ut.release[0] == '2') && (ut.release[1] == '.') &&
             (ut.release[2] == '4') && (ut.release[3] == '.') &&
             (ut.release[4] == '1') && (ut.release[5] >= '0') &&
             (ut.release[5] < '8')) &&
            (ext2fs_fstat(data->dev, &st) == 0) &&
            (ext2fsP_is_disk_device(st.st_mode))) {
                struct rlimit   rlim;

                rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
                setrlimit(RLIMIT_FSIZE, &rlim);
                getrlimit(RLIMIT_FSIZE, &rlim);
                if (((unsigned long) rlim.rlim_cur) <
                    ((unsigned long) rlim.rlim_max)) {
                        rlim.rlim_cur = rlim.rlim_max;
                        setrlimit(RLIMIT_FSIZE, &rlim);
                }
        }
#endif
#ifdef HAVE_PTHREAD
        if (flags & IO_FLAG_THREADS) {
                io->flags |= CHANNEL_FLAGS_THREADS;
                retval = pthread_mutex_init(&data->cache_mutex, NULL);
                if (retval)
                        goto cleanup;
                retval = pthread_mutex_init(&data->bounce_mutex, NULL);
                if (retval) {
                        pthread_mutex_destroy(&data->cache_mutex);
                        goto cleanup;
                }
                retval = pthread_mutex_init(&data->stats_mutex, NULL);
                if (retval) {
                        pthread_mutex_destroy(&data->cache_mutex);
                        pthread_mutex_destroy(&data->bounce_mutex);
                        goto cleanup;
                }
        }
#endif
        *channel = io;
        return 0;

cleanup:
        if (data) {
                if (data->dev >= 0)
                        close(data->dev);
                free_cache(data);
                ext2fs_free_mem(&data);
        }
        if (io) {
                if (io->name) {
                        ext2fs_free_mem(&io->name);
                }
                ext2fs_free_mem(&io);
        }
        return retval;
}

static errcode_t unixfd_open(const char *str_fd, int flags,
                             io_channel *channel)
{
        int fd;
        int fd_flags;

        fd = atoi(str_fd);
#if defined(HAVE_FCNTL)
        fd_flags = fcntl(fd, F_GETFD);
        if (fd_flags == -1)
                return EBADF;

        flags = 0;
        if (fd_flags & O_RDWR)
                flags |= IO_FLAG_RW;
        if (fd_flags & O_EXCL)
                flags |= IO_FLAG_EXCLUSIVE;
#if defined(O_DIRECT)
        if (fd_flags & O_DIRECT)
                flags |= IO_FLAG_DIRECT_IO;
#endif
#endif  /* HAVE_FCNTL */

        return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
}

static errcode_t unix_open(const char *name, int flags,
                           io_channel *channel)
{
        int fd = -1;
        int open_flags;

        if (name == 0)
                return EXT2_ET_BAD_DEVICE_NAME;

        open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
        if (flags & IO_FLAG_EXCLUSIVE)
                open_flags |= O_EXCL;
#if defined(O_DIRECT)
        if (flags & IO_FLAG_DIRECT_IO)
                open_flags |= O_DIRECT;
#endif
        fd = ext2fs_open_file(name, open_flags, 0);
        if (fd < 0)
                return errno;
#if defined(F_NOCACHE) && !defined(IO_DIRECT)
        if (flags & IO_FLAG_DIRECT_IO) {
                if (fcntl(fd, F_NOCACHE, 1) < 0)
                        return errno;
        }
#endif
        return unix_open_channel(name, fd, flags, channel, unix_io_manager);
}

static errcode_t unix_close(io_channel channel)
{
        struct unix_private_data *data;
        errcode_t       retval = 0;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (--channel->refcount > 0)
                return 0;

#ifndef NO_IO_CACHE
        retval = flush_cached_blocks(channel, data, 0);
#endif

        if (close(data->dev) < 0)
                retval = errno;
        free_cache(data);
#ifdef HAVE_PTHREAD
        if (data->flags & IO_FLAG_THREADS) {
                pthread_mutex_destroy(&data->cache_mutex);
                pthread_mutex_destroy(&data->bounce_mutex);
                pthread_mutex_destroy(&data->stats_mutex);
        }
#endif

        ext2fs_free_mem(&channel->private_data);
        if (channel->name)
                ext2fs_free_mem(&channel->name);
        ext2fs_free_mem(&channel);
        return retval;
}

static errcode_t unix_set_blksize(io_channel channel, int blksize)
{
        struct unix_private_data *data;
        errcode_t               retval = 0;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (channel->block_size != blksize) {
                mutex_lock(data, CACHE_MTX);
                mutex_lock(data, BOUNCE_MTX);
#ifndef NO_IO_CACHE
                if ((retval = flush_cached_blocks(channel, data, FLUSH_NOLOCK))){
                        mutex_unlock(data, BOUNCE_MTX);
                        mutex_unlock(data, CACHE_MTX);
                        return retval;
                }
#endif

                channel->block_size = blksize;
                free_cache(data);
                retval = alloc_cache(channel, data);
                mutex_unlock(data, BOUNCE_MTX);
                mutex_unlock(data, CACHE_MTX);
        }
        return retval;
}

static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
                               int count, void *buf)
{
        struct unix_private_data *data;
        struct unix_cache *cache;
        errcode_t       retval;
        char            *cp;
        int             i, j;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

#ifdef NO_IO_CACHE
        return raw_read_blk(channel, data, block, count, buf);
#else
        if (data->flags & IO_FLAG_NOCACHE)
                return raw_read_blk(channel, data, block, count, buf);
        /*
         * If we're doing an odd-sized read or a very large read,
         * flush out the cache and then do a direct read.
         */
        if (count < 0 || count > WRITE_DIRECT_SIZE) {
                if ((retval = flush_cached_blocks(channel, data, 0)))
                        return retval;
                return raw_read_blk(channel, data, block, count, buf);
        }

        cp = buf;
        mutex_lock(data, CACHE_MTX);
        while (count > 0) {
                /* If it's in the cache, use it! */
                if ((cache = find_cached_block(data, block, NULL))) {
#ifdef DEBUG
                        printf("Using cached block %lu\n", block);
#endif
                        memcpy(cp, cache->buf, channel->block_size);
                        count--;
                        block++;
                        cp += channel->block_size;
                        continue;
                }

                /*
                 * Find the number of uncached blocks so we can do a
                 * single read request
                 */
                for (i=1; i < count; i++)
                        if (find_cached_block(data, block+i, NULL))
                                break;
#ifdef DEBUG
                printf("Reading %d blocks starting at %lu\n", i, block);
#endif
                mutex_unlock(data, CACHE_MTX);
                if ((retval = raw_read_blk(channel, data, block, i, cp)))
                        return retval;
                mutex_lock(data, CACHE_MTX);

                /* Save the results in the cache */
                for (j=0; j < i; j++) {
                        if (!find_cached_block(data, block, &cache)) {
                                retval = reuse_cache(channel, data,
                                                     cache, block);
                                if (retval)
                                        goto call_write_handler;
                                memcpy(cache->buf, cp, channel->block_size);
                        }
                        count--;
                        block++;
                        cp += channel->block_size;
                }
        }
        mutex_unlock(data, CACHE_MTX);
        return 0;

call_write_handler:
        if (cache->write_err && channel->write_error) {
                char *err_buf = NULL;
                unsigned long long err_block = cache->block;

                cache->dirty = 0;
                cache->in_use = 0;
                cache->write_err = 0;
                if (io_channel_alloc_buf(channel, 0, &err_buf))
                        err_buf = NULL;
                else
                        memcpy(err_buf, cache->buf, channel->block_size);
                mutex_unlock(data, CACHE_MTX);
                (channel->write_error)(channel, err_block, 1, err_buf,
                                       channel->block_size, -1,
                                       retval);
                if (err_buf)
                        ext2fs_free_mem(&err_buf);
        } else
                mutex_unlock(data, CACHE_MTX);
        return retval;
#endif /* NO_IO_CACHE */
}

static errcode_t unix_read_blk(io_channel channel, unsigned long block,
                               int count, void *buf)
{
        return unix_read_blk64(channel, block, count, buf);
}

static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
                                int count, const void *buf)
{
        struct unix_private_data *data;
        struct unix_cache *cache, *reuse;
        errcode_t       retval = 0;
        const char      *cp;
        int             writethrough;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

#ifdef NO_IO_CACHE
        return raw_write_blk(channel, data, block, count, buf, 0);
#else
        if (data->flags & IO_FLAG_NOCACHE)
                return raw_write_blk(channel, data, block, count, buf, 0);
        /*
         * If we're doing an odd-sized write or a very large write,
         * flush out the cache completely and then do a direct write.
         */
        if (count < 0 || count > WRITE_DIRECT_SIZE) {
                if ((retval = flush_cached_blocks(channel, data,
                                                  FLUSH_INVALIDATE)))
                        return retval;
                return raw_write_blk(channel, data, block, count, buf, 0);
        }

        /*
         * For a moderate-sized multi-block write, first force a write
         * if we're in write-through cache mode, and then fill the
         * cache with the blocks.
         */
        writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
        if (writethrough)
                retval = raw_write_blk(channel, data, block, count, buf, 0);

        cp = buf;
        mutex_lock(data, CACHE_MTX);
        while (count > 0) {
                cache = find_cached_block(data, block, &reuse);
                if (!cache) {
                        errcode_t err;

                        cache = reuse;
                        err = reuse_cache(channel, data, cache, block);
                        if (err)
                                goto call_write_handler;
                }
                if (cache->buf != cp)
                        memcpy(cache->buf, cp, channel->block_size);
                cache->dirty = !writethrough;
                count--;
                block++;
                cp += channel->block_size;
        }
        mutex_unlock(data, CACHE_MTX);
        return retval;

call_write_handler:
        if (cache->write_err && channel->write_error) {
                char *err_buf = NULL;
                unsigned long long err_block = cache->block;

                cache->dirty = 0;
                cache->in_use = 0;
                cache->write_err = 0;
                if (io_channel_alloc_buf(channel, 0, &err_buf))
                        err_buf = NULL;
                else
                        memcpy(err_buf, cache->buf, channel->block_size);
                mutex_unlock(data, CACHE_MTX);
                (channel->write_error)(channel, err_block, 1, err_buf,
                                       channel->block_size, -1,
                                       retval);
                if (err_buf)
                        ext2fs_free_mem(&err_buf);
        } else
                mutex_unlock(data, CACHE_MTX);
        return retval;
#endif /* NO_IO_CACHE */
}

static errcode_t unix_cache_readahead(io_channel channel,
                                      unsigned long long block,
                                      unsigned long long count)
{
#ifdef POSIX_FADV_WILLNEED
        struct unix_private_data *data;

        data = (struct unix_private_data *)channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
        return posix_fadvise(data->dev,
                             (ext2_loff_t)block * channel->block_size + data->offset,
                             (ext2_loff_t)count * channel->block_size,
                             POSIX_FADV_WILLNEED);
#else
        return EXT2_ET_OP_NOT_SUPPORTED;
#endif
}

static errcode_t unix_write_blk(io_channel channel, unsigned long block,
                                int count, const void *buf)
{
        return unix_write_blk64(channel, block, count, buf);
}

static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
                                 int size, const void *buf)
{
        struct unix_private_data *data;
        errcode_t       retval = 0;
        ssize_t         actual;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (channel->align != 0) {
#ifdef ALIGN_DEBUG
                printf("unix_write_byte: O_DIRECT fallback\n");
#endif
                return EXT2_ET_UNIMPLEMENTED;
        }

#ifndef NO_IO_CACHE
        /*
         * Flush out the cache completely
         */
        if ((retval = flush_cached_blocks(channel, data, FLUSH_INVALIDATE)))
                return retval;
#endif

        if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
                return errno;

        actual = write(data->dev, buf, size);
        if (actual < 0)
                return errno;
        if (actual != size)
                return EXT2_ET_SHORT_WRITE;

        return 0;
}

/*
 * Flush data buffers to disk.
 */
static errcode_t unix_flush(io_channel channel)
{
        struct unix_private_data *data;
        errcode_t retval = 0;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

#ifndef NO_IO_CACHE
        retval = flush_cached_blocks(channel, data, 0);
#endif
#ifdef HAVE_FSYNC
        if (!retval && fsync(data->dev) != 0)
                return errno;
#endif
        return retval;
}

static errcode_t unix_set_option(io_channel channel, const char *option,
                                 const char *arg)
{
        struct unix_private_data *data;
        unsigned long long tmp;
        errcode_t retval;
        char *end;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (!strcmp(option, "offset")) {
                if (!arg)
                        return EXT2_ET_INVALID_ARGUMENT;

                tmp = strtoull(arg, &end, 0);
                if (*end)
                        return EXT2_ET_INVALID_ARGUMENT;
                data->offset = tmp;
                if (data->offset < 0)
                        return EXT2_ET_INVALID_ARGUMENT;
                return 0;
        }
        if (!strcmp(option, "cache")) {
                if (!arg)
                        return EXT2_ET_INVALID_ARGUMENT;
                if (!strcmp(arg, "on")) {
                        data->flags &= ~IO_FLAG_NOCACHE;
                        return 0;
                }
                if (!strcmp(arg, "off")) {
                        retval = flush_cached_blocks(channel, data, 0);
                        data->flags |= IO_FLAG_NOCACHE;
                        return retval;
                }
                return EXT2_ET_INVALID_ARGUMENT;
        }
        return EXT2_ET_INVALID_ARGUMENT;
}

#if defined(__linux__) && !defined(BLKDISCARD)
#define BLKDISCARD              _IO(0x12,119)
#endif

static errcode_t unix_discard(io_channel channel, unsigned long long block,
                              unsigned long long count)
{
        struct unix_private_data *data;
        int             ret = EOPNOTSUPP;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (channel->flags & CHANNEL_FLAGS_NODISCARD)
                goto unimplemented;

        if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
#ifdef BLKDISCARD
                __u64 range[2];

                range[0] = (__u64)(block) * channel->block_size + data->offset;
                range[1] = (__u64)(count) * channel->block_size;

                ret = ioctl(data->dev, BLKDISCARD, &range);
#else
                goto unimplemented;
#endif
        } else {
#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
                /*
                 * If we are not on block device, try to use punch hole
                 * to reclaim free space.
                 */
                ret = fallocate(data->dev,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                (off_t)(block) * channel->block_size + data->offset,
                                (off_t)(count) * channel->block_size);
#else
                goto unimplemented;
#endif
        }
        if (ret < 0) {
                if (errno == EOPNOTSUPP) {
                        channel->flags |= CHANNEL_FLAGS_NODISCARD;
                        goto unimplemented;
                }
                return errno;
        }
        return 0;
unimplemented:
        return EXT2_ET_UNIMPLEMENTED;
}

/*
 * If we know about ZERO_RANGE, try that before we try PUNCH_HOLE because
 * ZERO_RANGE doesn't unmap preallocated blocks.  We prefer fallocate because
 * it always invalidates page cache, and libext2fs requires that reads after
 * ZERO_RANGE return zeroes.
 */
static int __unix_zeroout(int fd, off_t offset, off_t len)
{
        int ret = -1;

#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_ZERO_RANGE)
        ret = fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
        if (ret == 0)
                return 0;
#endif
#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
        ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                        offset,  len);
        if (ret == 0)
                return 0;
#endif
        errno = EOPNOTSUPP;
        return ret;
}

/* parameters might not be used if OS doesn't support zeroout */
#if __GNUC_PREREQ (4, 6)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
                              unsigned long long count)
{
        struct unix_private_data *data;
        int             ret;

        EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
        data = (struct unix_private_data *) channel->private_data;
        EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);

        if (!(channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE)) {
                /* Regular file, try to use truncate/punch/zero. */
                struct stat statbuf;

                if (count == 0)
                        return 0;
                /*
                 * If we're trying to zero a range past the end of the file,
                 * extend the file size, then truncate everything.
                 */
                ret = fstat(data->dev, &statbuf);
                if (ret)
                        goto err;
                if ((unsigned long long) statbuf.st_size <
                        (block + count) * channel->block_size + data->offset) {
                        ret = ftruncate(data->dev,
                                        (block + count) * channel->block_size + data->offset);
                        if (ret)
                                goto err;
                }
        }

        if (channel->flags & CHANNEL_FLAGS_NOZEROOUT)
                goto unimplemented;

        ret = __unix_zeroout(data->dev,
                        (off_t)(block) * channel->block_size + data->offset,
                        (off_t)(count) * channel->block_size);
err:
        if (ret < 0) {
                if (errno == EOPNOTSUPP) {
                        channel->flags |= CHANNEL_FLAGS_NOZEROOUT;
                        goto unimplemented;
                }
                return errno;
        }
        return 0;
unimplemented:
        return EXT2_ET_UNIMPLEMENTED;
}
#if __GNUC_PREREQ (4, 6)
#pragma GCC diagnostic pop
#endif

static struct struct_io_manager struct_unix_manager = {
        .magic          = EXT2_ET_MAGIC_IO_MANAGER,
        .name           = "Unix I/O Manager",
        .open           = unix_open,
        .close          = unix_close,
        .set_blksize    = unix_set_blksize,
        .read_blk       = unix_read_blk,
        .write_blk      = unix_write_blk,
        .flush          = unix_flush,
        .write_byte     = unix_write_byte,
        .set_option     = unix_set_option,
        .get_stats      = unix_get_stats,
        .read_blk64     = unix_read_blk64,
        .write_blk64    = unix_write_blk64,
        .discard        = unix_discard,
        .cache_readahead        = unix_cache_readahead,
        .zeroout        = unix_zeroout,
};

io_manager unix_io_manager = &struct_unix_manager;

static struct struct_io_manager struct_unixfd_manager = {
        .magic          = EXT2_ET_MAGIC_IO_MANAGER,
        .name           = "Unix fd I/O Manager",
        .open           = unixfd_open,
        .close          = unix_close,
        .set_blksize    = unix_set_blksize,
        .read_blk       = unix_read_blk,
        .write_blk      = unix_write_blk,
        .flush          = unix_flush,
        .write_byte     = unix_write_byte,
        .set_option     = unix_set_option,
        .get_stats      = unix_get_stats,
        .read_blk64     = unix_read_blk64,
        .write_blk64    = unix_write_blk64,
        .discard        = unix_discard,
        .cache_readahead        = unix_cache_readahead,
        .zeroout        = unix_zeroout,
};

io_manager unixfd_io_manager = &struct_unixfd_manager;
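
/*
 * Illustrative sketch, not part of the library itself: callers normally do
 * not invoke the functions above directly but go through the io_channel
 * wrappers declared in ext2_io.h.  Assuming a hypothetical device path
 * ("/dev/sdb1" below is only an example), the flow looks roughly like this:
 *
 *	io_channel chan;
 *	char *buf;
 *	errcode_t err;
 *
 *	err = unix_io_manager->open("/dev/sdb1", IO_FLAG_RW, &chan);
 *	if (!err)
 *		err = io_channel_set_blksize(chan, 4096);
 *	if (!err)
 *		err = io_channel_alloc_buf(chan, 0, &buf);
 *	if (!err)
 *		err = io_channel_read_blk64(chan, 0, 1, buf);	// read block 0
 *	if (!err)
 *		err = io_channel_flush(chan);			// write back cache
 *	io_channel_close(chan);
 */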