Whamcloud - gitweb
Fix various clang and gcc -Wall warnings
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a small block cache (write-through only when requested).
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #define _XOPEN_SOURCE 600
19 #define _DARWIN_C_SOURCE
20 #define _FILE_OFFSET_BITS 64
21 #ifndef _LARGEFILE_SOURCE
22 #define _LARGEFILE_SOURCE
23 #endif
24 #ifndef _LARGEFILE64_SOURCE
25 #define _LARGEFILE64_SOURCE
26 #endif
27 #ifndef _GNU_SOURCE
28 #define _GNU_SOURCE
29 #endif
30
31 #include "config.h"
32 #include <stdio.h>
33 #include <string.h>
34 #if HAVE_UNISTD_H
35 #include <unistd.h>
36 #endif
37 #if HAVE_ERRNO_H
38 #include <errno.h>
39 #endif
40 #include <fcntl.h>
41 #include <time.h>
42 #ifdef __linux__
43 #include <sys/utsname.h>
44 #endif
45 #if HAVE_SYS_TYPES_H
46 #include <sys/types.h>
47 #endif
48 #ifdef HAVE_SYS_IOCTL_H
49 #include <sys/ioctl.h>
50 #endif
51 #ifdef HAVE_SYS_MOUNT_H
52 #include <sys/mount.h>
53 #endif
54 #if HAVE_SYS_STAT_H
55 #include <sys/stat.h>
56 #endif
57 #if HAVE_SYS_RESOURCE_H
58 #include <sys/resource.h>
59 #endif
60 #if HAVE_LINUX_FALLOC_H
61 #include <linux/falloc.h>
62 #endif
63
64 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
65 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
66 #endif
67
68 #undef ALIGN_DEBUG
69
70 #include "ext2_fs.h"
71 #include "ext2fs.h"
72
/*
 * For checking structure magic numbers...
 *
 * Expands to a statement that makes the *enclosing function* return
 * `code` when `(ptr)->magic` differs from `code`.  The body is wrapped
 * in do { } while (0) so the macro behaves as one statement and cannot
 * capture an `else` that follows it.  Note that `code` is evaluated
 * twice when the check fails.
 */

#define EXT2_CHECK_MAGIC(ptr, code) \
	do { \
		if ((ptr)->magic != (code)) \
			return (code); \
	} while (0)
79
/*
 * One slot of the block cache kept in struct unix_private_data.
 */
struct unix_cache {
	/* Data buffer for the cached block; allocated by alloc_cache() */
	char			*buf;
	/* Block number whose contents are held in buf */
	unsigned long long	block;
	/* Value of the owner's access counter when this slot was last used */
	int			access_time;
	/* Set when buf holds data that still has to be written out */
	unsigned int		dirty : 1;
	/* Set when the slot holds a valid block */
	unsigned int		in_use : 1;
};
87
88 #define CACHE_SIZE 8
89 #define WRITE_DIRECT_SIZE 4     /* Must be smaller than CACHE_SIZE */
90 #define READ_DIRECT_SIZE 4      /* Should be smaller than CACHE_SIZE */
91
92 struct unix_private_data {
93         int     magic;
94         int     dev;
95         int     flags;
96         int     align;
97         int     access_time;
98         ext2_loff_t offset;
99         struct unix_cache cache[CACHE_SIZE];
100         void    *bounce;
101         struct struct_io_stats io_stats;
102 };
103
/*
 * True when the integer or pointer value `n` is a multiple of `align`.
 * The test is a bit mask, so `align` has to be a non-zero power of two.
 * Both arguments are fully parenthesized so that expressions such as
 * `ptr + 1` are evaluated before the conversion to an integer.
 */
#define IS_ALIGNED(n, align) ((((unsigned long) (n)) & \
			       ((unsigned long) ((align) - 1))) == 0)
106
107 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
108 {
109         errcode_t       retval = 0;
110
111         struct unix_private_data *data;
112
113         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
114         data = (struct unix_private_data *) channel->private_data;
115         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
116
117         if (stats)
118                 *stats = &data->io_stats;
119
120         return retval;
121 }
122
123 /*
124  * Here are the raw I/O functions
125  */
126 static errcode_t raw_read_blk(io_channel channel,
127                               struct unix_private_data *data,
128                               unsigned long long block,
129                               int count, void *bufv)
130 {
131         errcode_t       retval;
132         ssize_t         size;
133         ext2_loff_t     location;
134         int             actual = 0;
135         unsigned char   *buf = bufv;
136
137         size = (count < 0) ? -count : count * channel->block_size;
138         data->io_stats.bytes_read += size;
139         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
140
141 #ifdef HAVE_PREAD64
142         /* Try an aligned pread */
143         if ((channel->align == 0) ||
144             (IS_ALIGNED(buf, channel->align) &&
145              IS_ALIGNED(size, channel->align))) {
146                 actual = pread64(data->dev, buf, size, location);
147                 if (actual == size)
148                         return 0;
149         }
150 #elif HAVE_PREAD
151         /* Try an aligned pread */
152         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
153             ((channel->align == 0) ||
154              (IS_ALIGNED(buf, channel->align) &&
155               IS_ALIGNED(size, channel->align)))) {
156                 actual = pread(data->dev, buf, size, location);
157                 if (actual == size)
158                         return 0;
159         }
160 #endif /* HAVE_PREAD */
161
162         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
163                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
164                 goto error_out;
165         }
166         if ((channel->align == 0) ||
167             (IS_ALIGNED(buf, channel->align) &&
168              IS_ALIGNED(size, channel->align))) {
169                 actual = read(data->dev, buf, size);
170                 if (actual != size) {
171                 short_read:
172                         if (actual < 0)
173                                 actual = 0;
174                         retval = EXT2_ET_SHORT_READ;
175                         goto error_out;
176                 }
177                 return 0;
178         }
179
180 #ifdef ALIGN_DEBUG
181         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
182                (unsigned long) size);
183 #endif
184
185         /*
186          * The buffer or size which we're trying to read isn't aligned
187          * to the O_DIRECT rules, so we need to do this the hard way...
188          */
189         while (size > 0) {
190                 actual = read(data->dev, data->bounce, channel->block_size);
191                 if (actual != channel->block_size)
192                         goto short_read;
193                 actual = size;
194                 if (size > channel->block_size)
195                         actual = channel->block_size;
196                 memcpy(buf, data->bounce, actual);
197                 size -= actual;
198                 buf += actual;
199         }
200         return 0;
201
202 error_out:
203         memset((char *) buf+actual, 0, size-actual);
204         if (channel->read_error)
205                 retval = (channel->read_error)(channel, block, count, buf,
206                                                size, actual, retval);
207         return retval;
208 }
209
210 static errcode_t raw_write_blk(io_channel channel,
211                                struct unix_private_data *data,
212                                unsigned long long block,
213                                int count, const void *bufv)
214 {
215         ssize_t         size;
216         ext2_loff_t     location;
217         int             actual = 0;
218         errcode_t       retval;
219         const unsigned char *buf = bufv;
220
221         if (count == 1)
222                 size = channel->block_size;
223         else {
224                 if (count < 0)
225                         size = -count;
226                 else
227                         size = count * channel->block_size;
228         }
229         data->io_stats.bytes_written += size;
230
231         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
232
233 #ifdef HAVE_PWRITE64
234         /* Try an aligned pwrite */
235         if ((channel->align == 0) ||
236             (IS_ALIGNED(buf, channel->align) &&
237              IS_ALIGNED(size, channel->align))) {
238                 actual = pwrite64(data->dev, buf, size, location);
239                 if (actual == size)
240                         return 0;
241         }
242 #elif HAVE_PWRITE
243         /* Try an aligned pwrite */
244         if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
245             ((channel->align == 0) ||
246              (IS_ALIGNED(buf, channel->align) &&
247               IS_ALIGNED(size, channel->align)))) {
248                 actual = pwrite(data->dev, buf, size, location);
249                 if (actual == size)
250                         return 0;
251         }
252 #endif /* HAVE_PWRITE */
253
254         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
255                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
256                 goto error_out;
257         }
258
259         if ((channel->align == 0) ||
260             (IS_ALIGNED(buf, channel->align) &&
261              IS_ALIGNED(size, channel->align))) {
262                 actual = write(data->dev, buf, size);
263                 if (actual != size) {
264                 short_write:
265                         retval = EXT2_ET_SHORT_WRITE;
266                         goto error_out;
267                 }
268                 return 0;
269         }
270
271 #ifdef ALIGN_DEBUG
272         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
273                (unsigned long) size);
274 #endif
275         /*
276          * The buffer or size which we're trying to write isn't aligned
277          * to the O_DIRECT rules, so we need to do this the hard way...
278          */
279         while (size > 0) {
280                 if (size < channel->block_size) {
281                         actual = read(data->dev, data->bounce,
282                                       channel->block_size);
283                         if (actual != channel->block_size) {
284                                 retval = EXT2_ET_SHORT_READ;
285                                 goto error_out;
286                         }
287                 }
288                 actual = size;
289                 if (size > channel->block_size)
290                         actual = channel->block_size;
291                 memcpy(data->bounce, buf, actual);
292                 actual = write(data->dev, data->bounce, channel->block_size);
293                 if (actual != channel->block_size)
294                         goto short_write;
295                 size -= actual;
296                 buf += actual;
297         }
298         return 0;
299
300 error_out:
301         if (channel->write_error)
302                 retval = (channel->write_error)(channel, block, count, buf,
303                                                 size, actual, retval);
304         return retval;
305 }
306
307
308 /*
309  * Here we implement the cache functions
310  */
311
312 /* Allocate the cache buffers */
313 static errcode_t alloc_cache(io_channel channel,
314                              struct unix_private_data *data)
315 {
316         errcode_t               retval;
317         struct unix_cache       *cache;
318         int                     i;
319
320         data->access_time = 0;
321         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
322                 cache->block = 0;
323                 cache->access_time = 0;
324                 cache->dirty = 0;
325                 cache->in_use = 0;
326                 if (cache->buf)
327                         ext2fs_free_mem(&cache->buf);
328                 retval = io_channel_alloc_buf(channel, 0, &cache->buf);
329                 if (retval)
330                         return retval;
331         }
332         if (channel->align) {
333                 if (data->bounce)
334                         ext2fs_free_mem(&data->bounce);
335                 retval = io_channel_alloc_buf(channel, 0, &data->bounce);
336         }
337         return retval;
338 }
339
340 /* Free the cache buffers */
341 static void free_cache(struct unix_private_data *data)
342 {
343         struct unix_cache       *cache;
344         int                     i;
345
346         data->access_time = 0;
347         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
348                 cache->block = 0;
349                 cache->access_time = 0;
350                 cache->dirty = 0;
351                 cache->in_use = 0;
352                 if (cache->buf)
353                         ext2fs_free_mem(&cache->buf);
354         }
355         if (data->bounce)
356                 ext2fs_free_mem(&data->bounce);
357 }
358
359 #ifndef NO_IO_CACHE
360 /*
361  * Try to find a block in the cache.  If the block is not found, and
362  * eldest is a non-zero pointer, then fill in eldest with the cache
363  * entry to that should be reused.
364  */
365 static struct unix_cache *find_cached_block(struct unix_private_data *data,
366                                             unsigned long long block,
367                                             struct unix_cache **eldest)
368 {
369         struct unix_cache       *cache, *unused_cache, *oldest_cache;
370         int                     i;
371
372         unused_cache = oldest_cache = 0;
373         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
374                 if (!cache->in_use) {
375                         if (!unused_cache)
376                                 unused_cache = cache;
377                         continue;
378                 }
379                 if (cache->block == block) {
380                         cache->access_time = ++data->access_time;
381                         return cache;
382                 }
383                 if (!oldest_cache ||
384                     (cache->access_time < oldest_cache->access_time))
385                         oldest_cache = cache;
386         }
387         if (eldest)
388                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
389         return 0;
390 }
391
392 /*
393  * Reuse a particular cache entry for another block.
394  */
395 static void reuse_cache(io_channel channel, struct unix_private_data *data,
396                  struct unix_cache *cache, unsigned long long block)
397 {
398         if (cache->dirty && cache->in_use)
399                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
400
401         cache->in_use = 1;
402         cache->dirty = 0;
403         cache->block = block;
404         cache->access_time = ++data->access_time;
405 }
406
407 /*
408  * Flush all of the blocks in the cache
409  */
410 static errcode_t flush_cached_blocks(io_channel channel,
411                                      struct unix_private_data *data,
412                                      int invalidate)
413
414 {
415         struct unix_cache       *cache;
416         errcode_t               retval, retval2;
417         int                     i;
418
419         retval2 = 0;
420         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
421                 if (!cache->in_use)
422                         continue;
423
424                 if (invalidate)
425                         cache->in_use = 0;
426
427                 if (!cache->dirty)
428                         continue;
429
430                 retval = raw_write_blk(channel, data,
431                                        cache->block, 1, cache->buf);
432                 if (retval)
433                         retval2 = retval;
434                 else
435                         cache->dirty = 0;
436         }
437         return retval2;
438 }
439 #endif /* NO_IO_CACHE */
440
441 #ifdef __linux__
442 #ifndef BLKDISCARDZEROES
443 #define BLKDISCARDZEROES _IO(0x12,124)
444 #endif
445 #endif
446
/*
 * Open `pathname` with the large-file variant of open() when the build
 * has one.  `mode` is forwarded to the underlying call only when it is
 * non-zero.  Returns what open()/open64() returns.
 */
int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
{
#if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
	return mode ? open64(pathname, flags, mode)
		    : open64(pathname, flags);
#else
	return mode ? open(pathname, flags, mode)
		    : open(pathname, flags);
#endif
}
460
461 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
462 {
463 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
464         return stat64(path, buf);
465 #else
466         return stat(path, buf);
467 #endif
468 }
469
470 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
471 {
472 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
473         return fstat64(fd, buf);
474 #else
475         return fstat(fd, buf);
476 #endif
477 }
478
479 static errcode_t unix_open(const char *name, int flags, io_channel *channel)
480 {
481         io_channel      io = NULL;
482         struct unix_private_data *data = NULL;
483         errcode_t       retval;
484         int             open_flags;
485         int             f_nocache = 0;
486         ext2fs_struct_stat st;
487 #ifdef __linux__
488         struct          utsname ut;
489 #endif
490
491         if (name == 0)
492                 return EXT2_ET_BAD_DEVICE_NAME;
493         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
494         if (retval)
495                 goto cleanup;
496         memset(io, 0, sizeof(struct struct_io_channel));
497         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
498         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
499         if (retval)
500                 goto cleanup;
501
502         io->manager = unix_io_manager;
503         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
504         if (retval)
505                 goto cleanup;
506
507         strcpy(io->name, name);
508         io->private_data = data;
509         io->block_size = 1024;
510         io->read_error = 0;
511         io->write_error = 0;
512         io->refcount = 1;
513
514         memset(data, 0, sizeof(struct unix_private_data));
515         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
516         data->io_stats.num_fields = 2;
517         data->dev = -1;
518
519         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
520         if (flags & IO_FLAG_EXCLUSIVE)
521                 open_flags |= O_EXCL;
522 #if defined(O_DIRECT)
523         if (flags & IO_FLAG_DIRECT_IO) {
524                 open_flags |= O_DIRECT;
525                 io->align = ext2fs_get_dio_alignment(data->dev);
526         }
527 #elif defined(F_NOCACHE)
528         if (flags & IO_FLAG_DIRECT_IO) {
529                 f_nocache = F_NOCACHE;
530                 io->align = 4096;
531         }
532 #endif
533         data->flags = flags;
534
535         data->dev = ext2fs_open_file(io->name, open_flags, 0);
536         if (data->dev < 0) {
537                 retval = errno;
538                 goto cleanup;
539         }
540         if (f_nocache) {
541                 if (fcntl(data->dev, f_nocache, 1) < 0) {
542                         retval = errno;
543                         goto cleanup;
544                 }
545         }
546
547         /*
548          * If the device is really a block device, then set the
549          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
550          * because we are going to use punch hole instead of discard
551          * and if it succeed, subsequent read from sparse area returns
552          * zero.
553          */
554         if (ext2fs_stat(io->name, &st) == 0) {
555                 if (S_ISBLK(st.st_mode))
556                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
557                 else
558                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
559         }
560
561 #ifdef BLKDISCARDZEROES
562         {
563                 int zeroes = 0;
564                 if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
565                     zeroes)
566                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
567         }
568 #endif
569
570 #if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
571         /*
572          * Some operating systems require that the buffers be aligned,
573          * regardless of O_DIRECT
574          */
575         if (!io->align)
576                 io->align = 512;
577 #endif
578
579
580         if ((retval = alloc_cache(io, data)))
581                 goto cleanup;
582
583 #ifdef BLKROGET
584         if (flags & IO_FLAG_RW) {
585                 int error;
586                 int readonly = 0;
587
588                 /* Is the block device actually writable? */
589                 error = ioctl(data->dev, BLKROGET, &readonly);
590                 if (!error && readonly) {
591                         retval = EPERM;
592                         goto cleanup;
593                 }
594         }
595 #endif
596
597 #ifdef __linux__
598 #undef RLIM_INFINITY
599 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
600 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
601 #else
602 #define RLIM_INFINITY  (~0UL)
603 #endif
604         /*
605          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
606          * block devices are wrongly getting hit by the filesize
607          * limit.  This workaround isn't perfect, since it won't work
608          * if glibc wasn't built against 2.2 header files.  (Sigh.)
609          *
610          */
611         if ((flags & IO_FLAG_RW) &&
612             (uname(&ut) == 0) &&
613             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
614              (ut.release[2] == '4') && (ut.release[3] == '.') &&
615              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
616              (ut.release[5] < '8')) &&
617             (ext2fs_stat(io->name, &st) == 0) &&
618             (S_ISBLK(st.st_mode))) {
619                 struct rlimit   rlim;
620
621                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
622                 setrlimit(RLIMIT_FSIZE, &rlim);
623                 getrlimit(RLIMIT_FSIZE, &rlim);
624                 if (((unsigned long) rlim.rlim_cur) <
625                     ((unsigned long) rlim.rlim_max)) {
626                         rlim.rlim_cur = rlim.rlim_max;
627                         setrlimit(RLIMIT_FSIZE, &rlim);
628                 }
629         }
630 #endif
631         *channel = io;
632         return 0;
633
634 cleanup:
635         if (data) {
636                 if (data->dev >= 0)
637                         close(data->dev);
638                 free_cache(data);
639                 ext2fs_free_mem(&data);
640         }
641         if (io) {
642                 if (io->name) {
643                         ext2fs_free_mem(&io->name);
644                 }
645                 ext2fs_free_mem(&io);
646         }
647         return retval;
648 }
649
650 static errcode_t unix_close(io_channel channel)
651 {
652         struct unix_private_data *data;
653         errcode_t       retval = 0;
654
655         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
656         data = (struct unix_private_data *) channel->private_data;
657         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
658
659         if (--channel->refcount > 0)
660                 return 0;
661
662 #ifndef NO_IO_CACHE
663         retval = flush_cached_blocks(channel, data, 0);
664 #endif
665
666         if (close(data->dev) < 0)
667                 retval = errno;
668         free_cache(data);
669
670         ext2fs_free_mem(&channel->private_data);
671         if (channel->name)
672                 ext2fs_free_mem(&channel->name);
673         ext2fs_free_mem(&channel);
674         return retval;
675 }
676
677 static errcode_t unix_set_blksize(io_channel channel, int blksize)
678 {
679         struct unix_private_data *data;
680         errcode_t               retval;
681
682         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
683         data = (struct unix_private_data *) channel->private_data;
684         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
685
686         if (channel->block_size != blksize) {
687 #ifndef NO_IO_CACHE
688                 if ((retval = flush_cached_blocks(channel, data, 0)))
689                         return retval;
690 #endif
691
692                 channel->block_size = blksize;
693                 free_cache(data);
694                 if ((retval = alloc_cache(channel, data)))
695                         return retval;
696         }
697         return 0;
698 }
699
700
701 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
702                                int count, void *buf)
703 {
704         struct unix_private_data *data;
705         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
706         errcode_t       retval;
707         char            *cp;
708         int             i, j;
709
710         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
711         data = (struct unix_private_data *) channel->private_data;
712         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
713
714 #ifdef NO_IO_CACHE
715         return raw_read_blk(channel, data, block, count, buf);
716 #else
717         /*
718          * If we're doing an odd-sized read or a very large read,
719          * flush out the cache and then do a direct read.
720          */
721         if (count < 0 || count > WRITE_DIRECT_SIZE) {
722                 if ((retval = flush_cached_blocks(channel, data, 0)))
723                         return retval;
724                 return raw_read_blk(channel, data, block, count, buf);
725         }
726
727         cp = buf;
728         while (count > 0) {
729                 /* If it's in the cache, use it! */
730                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
731 #ifdef DEBUG
732                         printf("Using cached block %lu\n", block);
733 #endif
734                         memcpy(cp, cache->buf, channel->block_size);
735                         count--;
736                         block++;
737                         cp += channel->block_size;
738                         continue;
739                 }
740                 if (count == 1) {
741                         /*
742                          * Special case where we read directly into the
743                          * cache buffer; important in the O_DIRECT case
744                          */
745                         cache = reuse[0];
746                         reuse_cache(channel, data, cache, block);
747                         if ((retval = raw_read_blk(channel, data, block, 1,
748                                                    cache->buf))) {
749                                 cache->in_use = 0;
750                                 return retval;
751                         }
752                         memcpy(cp, cache->buf, channel->block_size);
753                         return 0;
754                 }
755
756                 /*
757                  * Find the number of uncached blocks so we can do a
758                  * single read request
759                  */
760                 for (i=1; i < count; i++)
761                         if (find_cached_block(data, block+i, &reuse[i]))
762                                 break;
763 #ifdef DEBUG
764                 printf("Reading %d blocks starting at %lu\n", i, block);
765 #endif
766                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
767                         return retval;
768
769                 /* Save the results in the cache */
770                 for (j=0; j < i; j++) {
771                         count--;
772                         cache = reuse[j];
773                         reuse_cache(channel, data, cache, block++);
774                         memcpy(cache->buf, cp, channel->block_size);
775                         cp += channel->block_size;
776                 }
777         }
778         return 0;
779 #endif /* NO_IO_CACHE */
780 }
781
782 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
783                                int count, void *buf)
784 {
785         return unix_read_blk64(channel, block, count, buf);
786 }
787
788 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
789                                 int count, const void *buf)
790 {
791         struct unix_private_data *data;
792         struct unix_cache *cache, *reuse;
793         errcode_t       retval = 0;
794         const char      *cp;
795         int             writethrough;
796
797         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
798         data = (struct unix_private_data *) channel->private_data;
799         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
800
801 #ifdef NO_IO_CACHE
802         return raw_write_blk(channel, data, block, count, buf);
803 #else
804         /*
805          * If we're doing an odd-sized write or a very large write,
806          * flush out the cache completely and then do a direct write.
807          */
808         if (count < 0 || count > WRITE_DIRECT_SIZE) {
809                 if ((retval = flush_cached_blocks(channel, data, 1)))
810                         return retval;
811                 return raw_write_blk(channel, data, block, count, buf);
812         }
813
814         /*
815          * For a moderate-sized multi-block write, first force a write
816          * if we're in write-through cache mode, and then fill the
817          * cache with the blocks.
818          */
819         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
820         if (writethrough)
821                 retval = raw_write_blk(channel, data, block, count, buf);
822
823         cp = buf;
824         while (count > 0) {
825                 cache = find_cached_block(data, block, &reuse);
826                 if (!cache) {
827                         cache = reuse;
828                         reuse_cache(channel, data, cache, block);
829                 }
830                 if (cache->buf != cp)
831                         memcpy(cache->buf, cp, channel->block_size);
832                 cache->dirty = !writethrough;
833                 count--;
834                 block++;
835                 cp += channel->block_size;
836         }
837         return retval;
838 #endif /* NO_IO_CACHE */
839 }
840
841 static errcode_t unix_cache_readahead(io_channel channel,
842                                       unsigned long long block,
843                                       unsigned long long count)
844 {
845 #ifdef POSIX_FADV_WILLNEED
846         struct unix_private_data *data;
847
848         data = (struct unix_private_data *)channel->private_data;
849         return posix_fadvise(data->dev,
850                              (ext2_loff_t)block * channel->block_size,
851                              (ext2_loff_t)count * channel->block_size,
852                              POSIX_FADV_WILLNEED);
853 #else
854         return EXT2_ET_OP_NOT_SUPPORTED;
855 #endif
856 }
857
858 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
859                                 int count, const void *buf)
860 {
861         return unix_write_blk64(channel, block, count, buf);
862 }
863
864 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
865                                  int size, const void *buf)
866 {
867         struct unix_private_data *data;
868         errcode_t       retval = 0;
869         ssize_t         actual;
870
871         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
872         data = (struct unix_private_data *) channel->private_data;
873         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
874
875         if (channel->align != 0) {
876 #ifdef ALIGN_DEBUG
877                 printf("unix_write_byte: O_DIRECT fallback\n");
878 #endif
879                 return EXT2_ET_UNIMPLEMENTED;
880         }
881
882 #ifndef NO_IO_CACHE
883         /*
884          * Flush out the cache completely
885          */
886         if ((retval = flush_cached_blocks(channel, data, 1)))
887                 return retval;
888 #endif
889
890         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
891                 return errno;
892
893         actual = write(data->dev, buf, size);
894         if (actual != size)
895                 return EXT2_ET_SHORT_WRITE;
896
897         return 0;
898 }
899
900 /*
901  * Flush data buffers to disk.
902  */
903 static errcode_t unix_flush(io_channel channel)
904 {
905         struct unix_private_data *data;
906         errcode_t retval = 0;
907
908         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
909         data = (struct unix_private_data *) channel->private_data;
910         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
911
912 #ifndef NO_IO_CACHE
913         retval = flush_cached_blocks(channel, data, 0);
914 #endif
915         fsync(data->dev);
916         return retval;
917 }
918
919 static errcode_t unix_set_option(io_channel channel, const char *option,
920                                  const char *arg)
921 {
922         struct unix_private_data *data;
923         unsigned long long tmp;
924         char *end;
925
926         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
927         data = (struct unix_private_data *) channel->private_data;
928         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
929
930         if (!strcmp(option, "offset")) {
931                 if (!arg)
932                         return EXT2_ET_INVALID_ARGUMENT;
933
934                 tmp = strtoull(arg, &end, 0);
935                 if (*end)
936                         return EXT2_ET_INVALID_ARGUMENT;
937                 data->offset = tmp;
938                 if (data->offset < 0)
939                         return EXT2_ET_INVALID_ARGUMENT;
940                 return 0;
941         }
942         return EXT2_ET_INVALID_ARGUMENT;
943 }
944
945 #if defined(__linux__) && !defined(BLKDISCARD)
946 #define BLKDISCARD              _IO(0x12,119)
947 #endif
948
949 static errcode_t unix_discard(io_channel channel, unsigned long long block,
950                               unsigned long long count)
951 {
952         struct unix_private_data *data;
953         int             ret;
954
955         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
956         data = (struct unix_private_data *) channel->private_data;
957         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
958
959         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
960 #ifdef BLKDISCARD
961                 __u64 range[2];
962
963                 range[0] = (__u64)(block) * channel->block_size;
964                 range[1] = (__u64)(count) * channel->block_size;
965
966                 ret = ioctl(data->dev, BLKDISCARD, &range);
967 #else
968                 goto unimplemented;
969 #endif
970         } else {
971 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
972                 /*
973                  * If we are not on block device, try to use punch hole
974                  * to reclaim free space.
975                  */
976                 ret = fallocate(data->dev,
977                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
978                                 (off_t)(block) * channel->block_size,
979                                 (off_t)(count) * channel->block_size);
980 #else
981                 goto unimplemented;
982 #endif
983         }
984         if (ret < 0) {
985                 if (errno == EOPNOTSUPP)
986                         goto unimplemented;
987                 return errno;
988         }
989         return 0;
990 unimplemented:
991         return EXT2_ET_UNIMPLEMENTED;
992 }
993
994 /* parameters might not be used if OS doesn't support zeroout */
995 #pragma GCC diagnostic push
996 #pragma GCC diagnostic ignored "-Wunused-parameter"
997 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
998                               unsigned long long count)
999 {
1000         struct unix_private_data *data;
1001         int             ret;
1002
1003         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
1004         data = (struct unix_private_data *) channel->private_data;
1005         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
1006
1007         if (getenv("UNIX_IO_NOZEROOUT"))
1008                 goto unimplemented;
1009
1010         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
1011                 /* Not implemented until the BLKZEROOUT mess is fixed */
1012                 goto unimplemented;
1013         } else {
1014                 /* Regular file, try to use truncate/punch/zero. */
1015                 struct stat statbuf;
1016
1017                 if (count == 0)
1018                         return 0;
1019                 /*
1020                  * If we're trying to zero a range past the end of the file,
1021                  * extend the file size, then truncate everything.
1022                  */
1023                 ret = fstat(data->dev, &statbuf);
1024                 if (ret)
1025                         goto err;
1026                 if ((unsigned long long) statbuf.st_size < (block + count) * channel->block_size) {
1027                         ret = ftruncate(data->dev,
1028                                         (block + count) * channel->block_size);
1029                         if (ret)
1030                                 goto err;
1031                 }
1032 #if defined(HAVE_FALLOCATE) && (defined(FALLOC_FL_ZERO_RANGE) || \
1033         (defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)))
1034 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
1035                 ret = fallocate(data->dev,
1036                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1037                                 (off_t)(block) * channel->block_size,
1038                                 (off_t)(count) * channel->block_size);
1039                 if (ret == 0)
1040                         goto err;
1041 #endif
1042 #ifdef FALLOC_FL_ZERO_RANGE
1043                 ret = fallocate(data->dev,
1044                                 FALLOC_FL_ZERO_RANGE,
1045                                 (off_t)(block) * channel->block_size,
1046                                 (off_t)(count) * channel->block_size);
1047 #endif
1048 #else
1049                 goto unimplemented;
1050 #endif /* HAVE_FALLOCATE && (ZERO_RANGE || (PUNCH_HOLE && KEEP_SIZE)) */
1051         }
1052 err:
1053         if (ret < 0) {
1054                 if (errno == EOPNOTSUPP)
1055                         goto unimplemented;
1056                 return errno;
1057         }
1058         return 0;
1059 unimplemented:
1060         return EXT2_ET_UNIMPLEMENTED;
1061 }
1062 #pragma GCC diagnostic pop
1063
1064 static struct struct_io_manager struct_unix_manager = {
1065         .magic          = EXT2_ET_MAGIC_IO_MANAGER,
1066         .name           = "Unix I/O Manager",
1067         .open           = unix_open,
1068         .close          = unix_close,
1069         .set_blksize    = unix_set_blksize,
1070         .read_blk       = unix_read_blk,
1071         .write_blk      = unix_write_blk,
1072         .flush          = unix_flush,
1073         .write_byte     = unix_write_byte,
1074         .set_option     = unix_set_option,
1075         .get_stats      = unix_get_stats,
1076         .read_blk64     = unix_read_blk64,
1077         .write_blk64    = unix_write_blk64,
1078         .discard        = unix_discard,
1079         .cache_readahead        = unix_cache_readahead,
1080         .zeroout        = unix_zeroout,
1081 };
1082
1083 io_manager unix_io_manager = &struct_unix_manager;