Whamcloud - gitweb
build: fix compile warnings on OSX
[tools/e2fsprogs.git] / lib / ext2fs / unix_io.c
1 /*
2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
3  *      of the I/O manager.
4  *
5  * Implements a small block cache; write-through is optional (per-channel flag).
6  *
7  * Includes support for Windows NT under Cygwin.
8  *
9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10  *      2002 by Theodore Ts'o.
11  *
12  * %Begin-Header%
13  * This file may be redistributed under the terms of the GNU Library
14  * General Public License, version 2.
15  * %End-Header%
16  */
17
18 #define _LARGEFILE_SOURCE
19 #define _LARGEFILE64_SOURCE
20 #ifndef _GNU_SOURCE
21 #define _GNU_SOURCE
22 #endif
23
24 #include "config.h"
25 #include <stdio.h>
26 #include <string.h>
27 #if HAVE_UNISTD_H
28 #include <unistd.h>
29 #endif
30 #if HAVE_ERRNO_H
31 #include <errno.h>
32 #endif
33 #include <fcntl.h>
34 #include <time.h>
35 #ifdef __linux__
36 #include <sys/utsname.h>
37 #endif
38 #ifdef HAVE_SYS_IOCTL_H
39 #include <sys/ioctl.h>
40 #endif
41 #ifdef HAVE_SYS_MOUNT_H
42 #include <sys/mount.h>
43 #endif
44 #if HAVE_SYS_STAT_H
45 #include <sys/stat.h>
46 #endif
47 #if HAVE_SYS_TYPES_H
48 #include <sys/types.h>
49 #endif
50 #if HAVE_SYS_RESOURCE_H
51 #include <sys/resource.h>
52 #endif
53 #if HAVE_LINUX_FALLOC_H
54 #include <linux/falloc.h>
55 #endif
56
/*
 * Fallback ioctl request numbers for Linux builds where _IO() is
 * available but the system headers did not define these requests.
 */
#if defined(__linux__) && defined(_IO)
#ifndef BLKROGET
#define BLKROGET	_IO(0x12, 94)	/* Get read-only status (0 = read_write).  */
#endif
#ifndef BLKSSZGET
#define BLKSSZGET	_IO(0x12, 104)	/* get block device sector size */
#endif
#endif

/*
 * Define ALIGN_DEBUG instead to make the raw I/O routines print a
 * message whenever they fall back to the bounce buffer.
 */
#undef ALIGN_DEBUG
66
67 #include "ext2_fs.h"
68 #include "ext2fs.h"
69
/*
 * For checking structure magic numbers...
 *
 * EXT2_CHECK_MAGIC(obj, code) makes the *enclosing function* return
 * `code` when obj->magic differs from `code`, and does nothing
 * otherwise.  `code` is expanded twice, so it must be free of side
 * effects.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a
 * single statement; as a bare `if` it would capture the `else` of an
 * unbraced if/else written around it.
 */

#define EXT2_CHECK_MAGIC(obj, code) \
	do { \
		if ((obj)->magic != (code)) \
			return (code); \
	} while (0)
76
/*
 * One slot of the per-channel block cache.
 *
 * `block` is unsigned long long because every routine that fills or
 * compares it takes the block number as unsigned long long; a plain
 * unsigned long would truncate large block numbers where long is
 * 32 bits, and a later write-back would then hit the wrong block.
 */
struct unix_cache {
	char			*buf;		/* one block of cached data */
	unsigned long long	block;		/* block number held in buf */
	int			access_time;	/* value of the channel's access counter at last use */
	unsigned		dirty:1;	/* buf has not been written to the device yet */
	unsigned		in_use:1;	/* slot currently holds a valid block */
};
84
/* Number of slots in each channel's block cache. */
#define CACHE_SIZE		8

/*
 * Multi-block requests larger than WRITE_DIRECT_SIZE blocks bypass
 * the cache.  READ_DIRECT_SIZE sizes the reuse[] scratch array in
 * unix_read_blk64().
 */
#define WRITE_DIRECT_SIZE	4	/* Must be smaller than CACHE_SIZE */
#define READ_DIRECT_SIZE	4	/* Should be smaller than CACHE_SIZE */
88
89 struct unix_private_data {
90         int     magic;
91         int     dev;
92         int     flags;
93         int     align;
94         int     access_time;
95         ext2_loff_t offset;
96         struct unix_cache cache[CACHE_SIZE];
97         void    *bounce;
98         struct struct_io_stats io_stats;
99 };
100
/*
 * True when n (a pointer or an integer) is a multiple of align.
 * align is assumed to be a power of two, since the test is a simple
 * mask with (align - 1).  Both arguments are fully parenthesized so
 * that expressions can be passed safely.
 */
#define IS_ALIGNED(n, align) \
	((((unsigned long) (n)) & ((unsigned long) ((align) - 1))) == 0)
103
104 static errcode_t unix_open(const char *name, int flags, io_channel *channel);
105 static errcode_t unix_close(io_channel channel);
106 static errcode_t unix_set_blksize(io_channel channel, int blksize);
107 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
108                                int count, void *data);
109 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
110                                 int count, const void *data);
111 static errcode_t unix_flush(io_channel channel);
112 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
113                                 int size, const void *data);
114 static errcode_t unix_set_option(io_channel channel, const char *option,
115                                  const char *arg);
116 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
117 ;
118 static void reuse_cache(io_channel channel, struct unix_private_data *data,
119                  struct unix_cache *cache, unsigned long long block);
120 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
121                                int count, void *data);
122 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
123                                 int count, const void *data);
124 static errcode_t unix_discard(io_channel channel, unsigned long long block,
125                               unsigned long long count);
126
127 static struct struct_io_manager struct_unix_manager = {
128         EXT2_ET_MAGIC_IO_MANAGER,
129         "Unix I/O Manager",
130         unix_open,
131         unix_close,
132         unix_set_blksize,
133         unix_read_blk,
134         unix_write_blk,
135         unix_flush,
136         unix_write_byte,
137         unix_set_option,
138         unix_get_stats,
139         unix_read_blk64,
140         unix_write_blk64,
141         unix_discard,
142 };
143
144 io_manager unix_io_manager = &struct_unix_manager;
145
146 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
147 {
148         errcode_t       retval = 0;
149
150         struct unix_private_data *data;
151
152         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
153         data = (struct unix_private_data *) channel->private_data;
154         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
155
156         if (stats)
157                 *stats = &data->io_stats;
158
159         return retval;
160 }
161
162 /*
163  * Here are the raw I/O functions
164  */
165 static errcode_t raw_read_blk(io_channel channel,
166                               struct unix_private_data *data,
167                               unsigned long long block,
168                               int count, void *bufv)
169 {
170         errcode_t       retval;
171         ssize_t         size;
172         ext2_loff_t     location;
173         int             actual = 0;
174         unsigned char   *buf = bufv;
175
176         size = (count < 0) ? -count : count * channel->block_size;
177         data->io_stats.bytes_read += size;
178         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
179         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
180                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
181                 goto error_out;
182         }
183         if ((data->align == 0) ||
184             ((IS_ALIGNED(buf, data->align)) && IS_ALIGNED(size, data->align))) {
185                 actual = read(data->dev, buf, size);
186                 if (actual != size) {
187                 short_read:
188                         if (actual < 0)
189                                 actual = 0;
190                         retval = EXT2_ET_SHORT_READ;
191                         goto error_out;
192                 }
193                 return 0;
194         }
195
196 #ifdef ALIGN_DEBUG
197         printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
198                (unsigned long) size);
199 #endif
200
201         /*
202          * The buffer or size which we're trying to read isn't aligned
203          * to the O_DIRECT rules, so we need to do this the hard way...
204          */
205         while (size > 0) {
206                 actual = read(data->dev, data->bounce, channel->block_size);
207                 if (actual != channel->block_size)
208                         goto short_read;
209                 actual = size;
210                 if (size > channel->block_size)
211                         actual = channel->block_size;
212                 memcpy(buf, data->bounce, actual);
213                 size -= actual;
214                 buf += actual;
215         }
216         return 0;
217
218 error_out:
219         memset((char *) buf+actual, 0, size-actual);
220         if (channel->read_error)
221                 retval = (channel->read_error)(channel, block, count, buf,
222                                                size, actual, retval);
223         return retval;
224 }
225
226 static errcode_t raw_write_blk(io_channel channel,
227                                struct unix_private_data *data,
228                                unsigned long long block,
229                                int count, const void *bufv)
230 {
231         ssize_t         size;
232         ext2_loff_t     location;
233         int             actual = 0;
234         errcode_t       retval;
235         const unsigned char *buf = bufv;
236
237         if (count == 1)
238                 size = channel->block_size;
239         else {
240                 if (count < 0)
241                         size = -count;
242                 else
243                         size = count * channel->block_size;
244         }
245         data->io_stats.bytes_written += size;
246
247         location = ((ext2_loff_t) block * channel->block_size) + data->offset;
248         if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
249                 retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
250                 goto error_out;
251         }
252
253         if ((data->align == 0) ||
254             ((IS_ALIGNED(buf, data->align)) && IS_ALIGNED(size, data->align))) {
255                 actual = write(data->dev, buf, size);
256                 if (actual != size) {
257                 short_write:
258                         retval = EXT2_ET_SHORT_WRITE;
259                         goto error_out;
260                 }
261                 return 0;
262         }
263
264 #ifdef ALIGN_DEBUG
265         printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
266                (unsigned long) size);
267 #endif
268         /*
269          * The buffer or size which we're trying to write isn't aligned
270          * to the O_DIRECT rules, so we need to do this the hard way...
271          */
272         while (size > 0) {
273                 if (size < channel->block_size) {
274                         actual = read(data->dev, data->bounce,
275                                       channel->block_size);
276                         if (actual != channel->block_size) {
277                                 retval = EXT2_ET_SHORT_READ;
278                                 goto error_out;
279                         }
280                 }
281                 actual = size;
282                 if (size > channel->block_size)
283                         actual = channel->block_size;
284                 memcpy(data->bounce, buf, actual);
285                 actual = write(data->dev, data->bounce, channel->block_size);
286                 if (actual != channel->block_size)
287                         goto short_write;
288                 size -= actual;
289                 buf += actual;
290         }
291         return 0;
292
293 error_out:
294         if (channel->write_error)
295                 retval = (channel->write_error)(channel, block, count, buf,
296                                                 size, actual, retval);
297         return retval;
298 }
299
300
301 /*
302  * Here we implement the cache functions
303  */
304
305 /* Allocate the cache buffers */
306 static errcode_t alloc_cache(io_channel channel,
307                              struct unix_private_data *data)
308 {
309         errcode_t               retval;
310         struct unix_cache       *cache;
311         int                     i;
312
313         data->access_time = 0;
314         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
315                 cache->block = 0;
316                 cache->access_time = 0;
317                 cache->dirty = 0;
318                 cache->in_use = 0;
319                 if (cache->buf)
320                         ext2fs_free_mem(&cache->buf);
321                 retval = ext2fs_get_memalign(channel->block_size,
322                                              data->align, &cache->buf);
323                 if (retval)
324                         return retval;
325         }
326         if (data->align) {
327                 if (data->bounce)
328                         ext2fs_free_mem(&data->bounce);
329                 retval = ext2fs_get_memalign(channel->block_size, data->align,
330                                              &data->bounce);
331         }
332         return retval;
333 }
334
335 /* Free the cache buffers */
336 static void free_cache(struct unix_private_data *data)
337 {
338         struct unix_cache       *cache;
339         int                     i;
340
341         data->access_time = 0;
342         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
343                 cache->block = 0;
344                 cache->access_time = 0;
345                 cache->dirty = 0;
346                 cache->in_use = 0;
347                 if (cache->buf)
348                         ext2fs_free_mem(&cache->buf);
349         }
350         if (data->bounce)
351                 ext2fs_free_mem(&data->bounce);
352 }
353
354 #ifndef NO_IO_CACHE
355 /*
356  * Try to find a block in the cache.  If the block is not found, and
357  * eldest is a non-zero pointer, then fill in eldest with the cache
358  * entry to that should be reused.
359  */
360 static struct unix_cache *find_cached_block(struct unix_private_data *data,
361                                             unsigned long long block,
362                                             struct unix_cache **eldest)
363 {
364         struct unix_cache       *cache, *unused_cache, *oldest_cache;
365         int                     i;
366
367         unused_cache = oldest_cache = 0;
368         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
369                 if (!cache->in_use) {
370                         if (!unused_cache)
371                                 unused_cache = cache;
372                         continue;
373                 }
374                 if (cache->block == block) {
375                         cache->access_time = ++data->access_time;
376                         return cache;
377                 }
378                 if (!oldest_cache ||
379                     (cache->access_time < oldest_cache->access_time))
380                         oldest_cache = cache;
381         }
382         if (eldest)
383                 *eldest = (unused_cache) ? unused_cache : oldest_cache;
384         return 0;
385 }
386
387 /*
388  * Reuse a particular cache entry for another block.
389  */
390 static void reuse_cache(io_channel channel, struct unix_private_data *data,
391                  struct unix_cache *cache, unsigned long long block)
392 {
393         if (cache->dirty && cache->in_use)
394                 raw_write_blk(channel, data, cache->block, 1, cache->buf);
395
396         cache->in_use = 1;
397         cache->dirty = 0;
398         cache->block = block;
399         cache->access_time = ++data->access_time;
400 }
401
402 /*
403  * Flush all of the blocks in the cache
404  */
405 static errcode_t flush_cached_blocks(io_channel channel,
406                                      struct unix_private_data *data,
407                                      int invalidate)
408
409 {
410         struct unix_cache       *cache;
411         errcode_t               retval, retval2;
412         int                     i;
413
414         retval2 = 0;
415         for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
416                 if (!cache->in_use)
417                         continue;
418
419                 if (invalidate)
420                         cache->in_use = 0;
421
422                 if (!cache->dirty)
423                         continue;
424
425                 retval = raw_write_blk(channel, data,
426                                        cache->block, 1, cache->buf);
427                 if (retval)
428                         retval2 = retval;
429                 else
430                         cache->dirty = 0;
431         }
432         return retval2;
433 }
434 #endif /* NO_IO_CACHE */
435
/*
 * Fallback ioctl number for Linux builds whose headers lack it;
 * unix_open() uses this request to decide whether to set
 * CHANNEL_FLAGS_DISCARD_ZEROES.
 */
#if defined(__linux__) && !defined(BLKDISCARDZEROES)
#define BLKDISCARDZEROES	_IO(0x12,124)
#endif
441
442 static errcode_t unix_open(const char *name, int flags, io_channel *channel)
443 {
444         io_channel      io = NULL;
445         struct unix_private_data *data = NULL;
446         errcode_t       retval;
447         int             open_flags, zeroes = 0;
448         int             f_nocache = 0;
449         ext2fs_struct_stat st;
450 #ifdef __linux__
451         struct          utsname ut;
452 #endif
453
454         if (name == 0)
455                 return EXT2_ET_BAD_DEVICE_NAME;
456         retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
457         if (retval)
458                 goto cleanup;
459         memset(io, 0, sizeof(struct struct_io_channel));
460         io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
461         retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
462         if (retval)
463                 goto cleanup;
464
465         io->manager = unix_io_manager;
466         retval = ext2fs_get_mem(strlen(name)+1, &io->name);
467         if (retval)
468                 goto cleanup;
469
470         strcpy(io->name, name);
471         io->private_data = data;
472         io->block_size = 1024;
473         io->read_error = 0;
474         io->write_error = 0;
475         io->refcount = 1;
476
477         memset(data, 0, sizeof(struct unix_private_data));
478         data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
479         data->io_stats.num_fields = 2;
480
481         open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
482         if (flags & IO_FLAG_EXCLUSIVE)
483                 open_flags |= O_EXCL;
484 #if defined(O_DIRECT)
485         if (flags & IO_FLAG_DIRECT_IO)
486                 open_flags |= O_DIRECT;
487 #elif defined(F_NOCACHE)
488         if (flags & IO_FLAG_DIRECT_IO)
489                 f_nocache = F_NOCACHE;
490 #endif
491         data->flags = flags;
492
493         data->dev = ext2fs_open_file(io->name, open_flags, 0);
494         if (data->dev < 0) {
495                 retval = errno;
496                 goto cleanup;
497         }
498         if (f_nocache) {
499                 if (fcntl(data->dev, f_nocache, 1) < 0) {
500                         retval = errno;
501                         goto cleanup;
502                 }
503         }
504
505         /*
506          * If the device is really a block device, then set the
507          * appropriate flag, otherwise we can set DISCARD_ZEROES flag
508          * because we are going to use punch hole instead of discard
509          * and if it succeed, subsequent read from sparse area returns
510          * zero.
511          */
512         if (ext2fs_stat(io->name, &st) == 0) {
513                 if (S_ISBLK(st.st_mode))
514                         io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
515                 else
516                         io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
517         }
518
519 #ifdef BLKSSZGET
520         if (flags & IO_FLAG_DIRECT_IO) {
521                 if (ioctl(data->dev, BLKSSZGET, &data->align) != 0)
522                         data->align = io->block_size;
523         }
524 #endif
525
526 #ifdef BLKDISCARDZEROES
527         ioctl(data->dev, BLKDISCARDZEROES, &zeroes);
528         if (zeroes)
529                 io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
530 #endif
531
532 #if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
533         /*
534          * Some operating systems require that the buffers be aligned,
535          * regardless of O_DIRECT
536          */
537         data->align = 512;
538 #endif
539
540
541         if ((retval = alloc_cache(io, data)))
542                 goto cleanup;
543
544 #ifdef BLKROGET
545         if (flags & IO_FLAG_RW) {
546                 int error;
547                 int readonly = 0;
548
549                 /* Is the block device actually writable? */
550                 error = ioctl(data->dev, BLKROGET, &readonly);
551                 if (!error && readonly) {
552                         close(data->dev);
553                         retval = EPERM;
554                         goto cleanup;
555                 }
556         }
557 #endif
558
559 #ifdef __linux__
560 #undef RLIM_INFINITY
561 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
562 #define RLIM_INFINITY   ((unsigned long)(~0UL>>1))
563 #else
564 #define RLIM_INFINITY  (~0UL)
565 #endif
566         /*
567          * Work around a bug in 2.4.10-2.4.18 kernels where writes to
568          * block devices are wrongly getting hit by the filesize
569          * limit.  This workaround isn't perfect, since it won't work
570          * if glibc wasn't built against 2.2 header files.  (Sigh.)
571          *
572          */
573         if ((flags & IO_FLAG_RW) &&
574             (uname(&ut) == 0) &&
575             ((ut.release[0] == '2') && (ut.release[1] == '.') &&
576              (ut.release[2] == '4') && (ut.release[3] == '.') &&
577              (ut.release[4] == '1') && (ut.release[5] >= '0') &&
578              (ut.release[5] < '8')) &&
579             (ext2fs_stat(io->name, &st) == 0) &&
580             (S_ISBLK(st.st_mode))) {
581                 struct rlimit   rlim;
582
583                 rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
584                 setrlimit(RLIMIT_FSIZE, &rlim);
585                 getrlimit(RLIMIT_FSIZE, &rlim);
586                 if (((unsigned long) rlim.rlim_cur) <
587                     ((unsigned long) rlim.rlim_max)) {
588                         rlim.rlim_cur = rlim.rlim_max;
589                         setrlimit(RLIMIT_FSIZE, &rlim);
590                 }
591         }
592 #endif
593         *channel = io;
594         return 0;
595
596 cleanup:
597         if (data) {
598                 free_cache(data);
599                 ext2fs_free_mem(&data);
600         }
601         if (io)
602                 ext2fs_free_mem(&io);
603         return retval;
604 }
605
606 static errcode_t unix_close(io_channel channel)
607 {
608         struct unix_private_data *data;
609         errcode_t       retval = 0;
610
611         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
612         data = (struct unix_private_data *) channel->private_data;
613         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
614
615         if (--channel->refcount > 0)
616                 return 0;
617
618 #ifndef NO_IO_CACHE
619         retval = flush_cached_blocks(channel, data, 0);
620 #endif
621
622         if (close(data->dev) < 0)
623                 retval = errno;
624         free_cache(data);
625
626         ext2fs_free_mem(&channel->private_data);
627         if (channel->name)
628                 ext2fs_free_mem(&channel->name);
629         ext2fs_free_mem(&channel);
630         return retval;
631 }
632
633 static errcode_t unix_set_blksize(io_channel channel, int blksize)
634 {
635         struct unix_private_data *data;
636         errcode_t               retval;
637
638         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
639         data = (struct unix_private_data *) channel->private_data;
640         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
641
642         if (channel->block_size != blksize) {
643 #ifndef NO_IO_CACHE
644                 if ((retval = flush_cached_blocks(channel, data, 0)))
645                         return retval;
646 #endif
647
648                 channel->block_size = blksize;
649                 free_cache(data);
650                 if ((retval = alloc_cache(channel, data)))
651                         return retval;
652         }
653         return 0;
654 }
655
656
657 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
658                                int count, void *buf)
659 {
660         struct unix_private_data *data;
661         struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
662         errcode_t       retval;
663         char            *cp;
664         int             i, j;
665
666         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
667         data = (struct unix_private_data *) channel->private_data;
668         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
669
670 #ifdef NO_IO_CACHE
671         return raw_read_blk(channel, data, block, count, buf);
672 #else
673         /*
674          * If we're doing an odd-sized read or a very large read,
675          * flush out the cache and then do a direct read.
676          */
677         if (count < 0 || count > WRITE_DIRECT_SIZE) {
678                 if ((retval = flush_cached_blocks(channel, data, 0)))
679                         return retval;
680                 return raw_read_blk(channel, data, block, count, buf);
681         }
682
683         cp = buf;
684         while (count > 0) {
685                 /* If it's in the cache, use it! */
686                 if ((cache = find_cached_block(data, block, &reuse[0]))) {
687 #ifdef DEBUG
688                         printf("Using cached block %lu\n", block);
689 #endif
690                         memcpy(cp, cache->buf, channel->block_size);
691                         count--;
692                         block++;
693                         cp += channel->block_size;
694                         continue;
695                 }
696                 if (count == 1) {
697                         /*
698                          * Special case where we read directly into the
699                          * cache buffer; important in the O_DIRECT case
700                          */
701                         cache = reuse[0];
702                         reuse_cache(channel, data, cache, block);
703                         if ((retval = raw_read_blk(channel, data, block, 1,
704                                                    cache->buf))) {
705                                 cache->in_use = 0;
706                                 return retval;
707                         }
708                         memcpy(cp, cache->buf, channel->block_size);
709                         return 0;
710                 }
711
712                 /*
713                  * Find the number of uncached blocks so we can do a
714                  * single read request
715                  */
716                 for (i=1; i < count; i++)
717                         if (find_cached_block(data, block+i, &reuse[i]))
718                                 break;
719 #ifdef DEBUG
720                 printf("Reading %d blocks starting at %lu\n", i, block);
721 #endif
722                 if ((retval = raw_read_blk(channel, data, block, i, cp)))
723                         return retval;
724
725                 /* Save the results in the cache */
726                 for (j=0; j < i; j++) {
727                         count--;
728                         cache = reuse[j];
729                         reuse_cache(channel, data, cache, block++);
730                         memcpy(cache->buf, cp, channel->block_size);
731                         cp += channel->block_size;
732                 }
733         }
734         return 0;
735 #endif /* NO_IO_CACHE */
736 }
737
738 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
739                                int count, void *buf)
740 {
741         return unix_read_blk64(channel, block, count, buf);
742 }
743
744 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
745                                 int count, const void *buf)
746 {
747         struct unix_private_data *data;
748         struct unix_cache *cache, *reuse;
749         errcode_t       retval = 0;
750         const char      *cp;
751         int             writethrough;
752
753         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
754         data = (struct unix_private_data *) channel->private_data;
755         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
756
757 #ifdef NO_IO_CACHE
758         return raw_write_blk(channel, data, block, count, buf);
759 #else
760         /*
761          * If we're doing an odd-sized write or a very large write,
762          * flush out the cache completely and then do a direct write.
763          */
764         if (count < 0 || count > WRITE_DIRECT_SIZE) {
765                 if ((retval = flush_cached_blocks(channel, data, 1)))
766                         return retval;
767                 return raw_write_blk(channel, data, block, count, buf);
768         }
769
770         /*
771          * For a moderate-sized multi-block write, first force a write
772          * if we're in write-through cache mode, and then fill the
773          * cache with the blocks.
774          */
775         writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
776         if (writethrough)
777                 retval = raw_write_blk(channel, data, block, count, buf);
778
779         cp = buf;
780         while (count > 0) {
781                 cache = find_cached_block(data, block, &reuse);
782                 if (!cache) {
783                         cache = reuse;
784                         reuse_cache(channel, data, cache, block);
785                 }
786                 memcpy(cache->buf, cp, channel->block_size);
787                 cache->dirty = !writethrough;
788                 count--;
789                 block++;
790                 cp += channel->block_size;
791         }
792         return retval;
793 #endif /* NO_IO_CACHE */
794 }
795
796 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
797                                 int count, const void *buf)
798 {
799         return unix_write_blk64(channel, block, count, buf);
800 }
801
802 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
803                                  int size, const void *buf)
804 {
805         struct unix_private_data *data;
806         errcode_t       retval = 0;
807         ssize_t         actual;
808
809         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
810         data = (struct unix_private_data *) channel->private_data;
811         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
812
813         if (data->align != 0) {
814 #ifdef ALIGN_DEBUG
815                 printf("unix_write_byte: O_DIRECT fallback\n");
816 #endif
817                 return EXT2_ET_UNIMPLEMENTED;
818         }
819
820 #ifndef NO_IO_CACHE
821         /*
822          * Flush out the cache completely
823          */
824         if ((retval = flush_cached_blocks(channel, data, 1)))
825                 return retval;
826 #endif
827
828         if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
829                 return errno;
830
831         actual = write(data->dev, buf, size);
832         if (actual != size)
833                 return EXT2_ET_SHORT_WRITE;
834
835         return 0;
836 }
837
838 /*
839  * Flush data buffers to disk.
840  */
841 static errcode_t unix_flush(io_channel channel)
842 {
843         struct unix_private_data *data;
844         errcode_t retval = 0;
845
846         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
847         data = (struct unix_private_data *) channel->private_data;
848         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
849
850 #ifndef NO_IO_CACHE
851         retval = flush_cached_blocks(channel, data, 0);
852 #endif
853         fsync(data->dev);
854         return retval;
855 }
856
857 static errcode_t unix_set_option(io_channel channel, const char *option,
858                                  const char *arg)
859 {
860         struct unix_private_data *data;
861         unsigned long long tmp;
862         char *end;
863
864         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
865         data = (struct unix_private_data *) channel->private_data;
866         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
867
868         if (!strcmp(option, "offset")) {
869                 if (!arg)
870                         return EXT2_ET_INVALID_ARGUMENT;
871
872                 tmp = strtoull(arg, &end, 0);
873                 if (*end)
874                         return EXT2_ET_INVALID_ARGUMENT;
875                 data->offset = tmp;
876                 if (data->offset < 0)
877                         return EXT2_ET_INVALID_ARGUMENT;
878                 return 0;
879         }
880         return EXT2_ET_INVALID_ARGUMENT;
881 }
882
/* Fallback ioctl number for Linux builds whose headers lack BLKDISCARD. */
#ifdef __linux__
#ifndef BLKDISCARD
#define BLKDISCARD		_IO(0x12,119)
#endif
#endif
886
887 static errcode_t unix_discard(io_channel channel, unsigned long long block,
888                               unsigned long long count)
889 {
890         struct unix_private_data *data;
891         __uint64_t      range[2];
892         int             ret;
893
894         EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
895         data = (struct unix_private_data *) channel->private_data;
896         EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
897
898         if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
899 #ifdef BLKDISCARD
900                 range[0] = (__uint64_t)(block) * channel->block_size;
901                 range[1] = (__uint64_t)(count) * channel->block_size;
902
903                 ret = ioctl(data->dev, BLKDISCARD, &range);
904 #else
905                 goto unimplemented;
906 #endif
907         } else {
908 #ifdef FALLOC_FL_PUNCH_HOLE
909                 /*
910                  * If we are not on block device, try to use punch hole
911                  * to reclaim free space.
912                  */
913                 ret = fallocate(data->dev,
914                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
915                                 (off_t)(block) * channel->block_size,
916                                 (off_t)(count) * channel->block_size);
917 #else
918                 goto unimplemented;
919 #endif
920         }
921         if (ret < 0) {
922                 if (errno == EOPNOTSUPP)
923                         goto unimplemented;
924                 return errno;
925         }
926         return 0;
927 unimplemented:
928         return EXT2_ET_UNIMPLEMENTED;
929 }