sparsify - utility to punch out blocks of 0s in a file

To: ext4 development <linux-ext4@xxxxxxxxxxxxxxx>, xfs-oss <xfs@xxxxxxxxxxx>
Subject: sparsify - utility to punch out blocks of 0s in a file
From: Eric Sandeen <sandeen@xxxxxxxxxx>
Date: Sat, 04 Feb 2012 14:04:00 -0600
Now that ext4, xfs, & ocfs2 can support punch hole, a tool to
"re-sparsify" a file by punching out ranges of 0s might be in order.

I whipped this up quickly; it probably has bugs and off-by-one errors, but I
thought I'd send it out anyway.  Doing 4k reads by default is probably not
terribly efficient.

I'll see if util-linux wants it after it gets beat into shape.
(or did a tool like this already exist and I missed it?)

(Another mode which does a file copy, possibly from stdin
might be good, like e2fsprogs/contrib/make-sparse.c ?  Although
that can be hacked up with cp already).

It works like this:

[root@inode sparsify]# ./sparsify  -h
Usage: sparsify [-m min hole size] [-o offset] [-l length] filename

[root@inode sparsify]# dd if=/dev/zero of=fsfile bs=1M count=512
[root@inode sparsify]# mkfs.xfs fsfile >/dev/null
[root@inode sparsify]# du -hc fsfile
512M    fsfile
512M    total
[root@inode sparsify]# ./sparsify fsfile
punching out holes of minimum size 4096 in range 0-536870912
[root@inode sparsify]# du -hc fsfile
129M    fsfile
129M    total
[root@inode sparsify]# xfs_repair fsfile
Phase 1 - find and verify superblock...
Phase 7 - verify and correct link counts...
[root@inode sparsify]# echo $?
[root@inode sparsify]# 

/*
 * sparsify - utility to punch out blocks of 0s in a file
 *
 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
 * Written by Eric Sandeen <sandeen@xxxxxxxxxx>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE     /* fallocate(2) is a GNU/Linux extension in <fcntl.h> */
#endif

#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <linux/falloc.h>

/*
 * Fallback values for the fallocate(2) mode flags, used only when the
 * included headers do not already define them.
 */
#ifndef FALLOC_FL_KEEP_SIZE
#define FALLOC_FL_KEEP_SIZE     0x01 /* do not change the file size */
#endif
#ifndef FALLOC_FL_PUNCH_HOLE
#define FALLOC_FL_PUNCH_HOLE    0x02 /* de-allocates range */
#endif

/*
 * usage - print the command-line synopsis to stdout and terminate the
 * process with a failure status.  Does not return.
 */
void usage(void)
{
        printf("Usage: sparsify [-m min hole size] [-o offset] [-l length] filename\n");
        exit(EXIT_FAILURE);
}

/*
 * Convert a count of binary units into bytes, as a long long.
 *
 * Multiplication is used instead of a left shift because shifting a
 * negative signed value is undefined behaviour in C.  The caller is still
 * responsible for making sure the product fits in a long long.
 */
#define EXABYTES(x)     ((long long)(x) * (1LL << 60))
#define PETABYTES(x)    ((long long)(x) * (1LL << 50))
#define TERABYTES(x)    ((long long)(x) * (1LL << 40))
#define GIGABYTES(x)    ((long long)(x) * (1LL << 30))
#define MEGABYTES(x)    ((long long)(x) * (1LL << 20))
#define KILOBYTES(x)    ((long long)(x) * (1LL << 10))

/*
 * Alignment helpers built on a bit mask.
 *
 * __round_mask(x, y) is (y - 1) converted to the type of x.
 * round_up(x, y) yields the smallest multiple of y that is >= x;
 * round_down(x, y) yields the largest multiple of y that is <= x.
 *
 * Because they work by masking low bits, the results are only meaningful
 * when y is a power of two.  __typeof__ is a compiler extension.
 */
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))

/*
 * Diagnostic verbosity level.  Any nonzero value enables progress messages;
 * values greater than 1 additionally enable per-chunk tracing.
 */
int debug;

/*
 * cvtnum - parse a size argument with an optional binary-unit suffix.
 *
 * The number is parsed by strtoll() with base 0, so decimal, octal ("0...")
 * and hexadecimal ("0x...") are accepted.  It may be followed by exactly one
 * case-insensitive suffix character: k, m, g, t, p or e, scaling the value
 * by 2^10, 2^20, 2^30, 2^40, 2^50 or 2^60 respectively.
 *
 * Returns the resulting byte count, or -1 when the string is NULL, empty,
 * not a number, has trailing garbage or an unknown suffix, or when the
 * value does not fit in a long long.  Negative inputs are passed through
 * (scaled), so callers that need a non-negative size must check for < 0.
 */
long long
cvtnum(char *s)
{
        long long       value;
        long long       mult;
        char            *end;

        if (!s)
                return -1LL;

        errno = 0;
        value = strtoll(s, &end, 0);
        if (end == s || errno == ERANGE)
                return -1LL;
        if (*end == '\0')
                return value;
        if (end[1] != '\0')
                return -1LL;

        /* <ctype.h> functions require an unsigned char value */
        switch (tolower((unsigned char)*end)) {
        case 'k':
                mult = 1LL << 10;
                break;
        case 'm':
                mult = 1LL << 20;
                break;
        case 'g':
                mult = 1LL << 30;
                break;
        case 't':
                mult = 1LL << 40;
                break;
        case 'p':
                mult = 1LL << 50;
                break;
        case 'e':
                mult = 1LL << 60;
                break;
        default:
                return -1LL;
        }

        /* Reject values whose scaled result would overflow long long */
        if (value > LLONG_MAX / mult || value < LLONG_MIN / mult)
                return -1LL;

        return value * mult;
}

int punch_hole(int fd, off_t offset, off_t len)
        int error = 0;

        if (debug)
                printf("punching at %lld len %lld\n", offset, len);
        //error = fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
        //                offset, len);
        if (error < 0) {
                perror("punch failed");

int main(int argc, char **argv)
        int     fd;
        char    *fname;
        int     opt;
        loff_t  min_hole = 0;
        loff_t  punch_range_start = 0;
        loff_t  punch_range_len = 0;
        loff_t  punch_range_end = 0;
        loff_t  cur_offset = 0;
        unsigned long blocksize;
        struct statvfs statvfsbuf;
        struct stat statbuf;
        ssize_t ret;
        off_t   punch_offset, punch_len;
        char    *readbuf, *zerobuf;

        while ((opt = getopt(argc, argv, "m:l:o:vh")) != -1) {
                switch(opt) {
                case 'm':
                        min_hole = cvtnum(optarg);
                case 'o':
                        punch_range_start = cvtnum(optarg);
                case 'l':
                        punch_range_len = cvtnum(optarg);
                case 'v':
                case 'h':

        if (min_hole < 0) {
                printf("Error: invalid min hole value specified\n");

        if (punch_range_len < 0) {
                printf("Error: invalid length value specified\n");

        if (punch_range_start < 0) {
                printf("Error: invalid offset value specified\n");

        if (optind == argc) {
                printf("Error: no filename specified\n");

        fname = argv[optind++];

        fd = open(fname, O_RDWR);
        if (fd < 0) {
                perror("Error opening file");

        if (fstat(fd, &statbuf) < 0) {
                perror("Error stat-ing file");

        if (fstatvfs(fd, &statvfsbuf) < 0) {
                perror("Error stat-ing fs");

        blocksize = statvfsbuf.f_bsize;
        if (debug)
                printf("blocksize is %lu\n", blocksize);

        /* default range end is end of file */
        if (!punch_range_len)
                punch_range_end = statbuf.st_size;
                punch_range_end = punch_range_start + punch_range_len;

        if (punch_range_end > statbuf.st_size) {
                printf("Error: range extends past EOF\n");

        if (debug)
                printf("orig start/end %lld/%lld/%lld\n", punch_range_start, 
punch_range_end, min_hole);

         * Normalize to blocksize-aligned range:
         * round start down, round end up - get all blocks including the range 

        punch_range_start = round_down(punch_range_start, blocksize);
        punch_range_end = round_up(punch_range_end, blocksize);
        min_hole = round_up(min_hole, blocksize);
        if (!min_hole)
                min_hole = blocksize;

        if (debug)
                printf("new start/end/min %lld/%lld/%lld\n", punch_range_start, 
punch_range_end, min_hole);

        if (punch_range_end <= punch_range_start) {
                printf("Range too small, nothing to do\n");

        readbuf = malloc(min_hole);
        zerobuf = malloc(min_hole);

        if (!readbuf || !zerobuf) {
                perror("buffer allocation failed");

        memset(zerobuf, 0, min_hole);

        punch_offset = -1;
        punch_len = 0;

        /* Move to the start of our requested range */
        if (punch_range_start)
                lseek(fd, punch_range_start, SEEK_SET);
        cur_offset = punch_range_start;

        printf("punching out holes of minimum size %lld in range %lld-%lld\n",
                min_hole, punch_range_start, punch_range_end);

         * Read through the file, finding block-aligned regions of 0s.
         * If the region is at least min_hole, punch it out.
         * This should be starting at a block-aligned offset

        while ((ret = read(fd, readbuf, min_hole)) > 0) {

                if (!memcmp(readbuf, zerobuf, min_hole)) {
                        /* Block of zeros, so extend punch range */
                        if (punch_offset < 0)
                                punch_offset = cur_offset;
                        punch_len += min_hole;
                        if (debug > 1)
                                printf("found zeros at %lld, hole len now 
%lld\n", cur_offset, punch_len);
                } else if (punch_offset > 0) {
                        /* Found nonzero byte; punch accumulated hole if it's 
big enough */
                        if (punch_len >= min_hole)
                                punch_hole(fd, punch_offset, punch_len);
                        else if (debug > 1)
                                printf("skipping hole of insufficient size 
%lld\n", punch_len);

                        /* reset punch range */
                        punch_offset = -1;
                        punch_len = 0;

                cur_offset += ret;
                /* Quit if we've moved beyond the specified range to punch */
                if (cur_offset >= punch_range_end) {
                        /* punch out last hole in range if needed */
                        if (punch_offset > 0 && punch_len >= min_hole)
                                punch_hole(fd, punch_offset, punch_len);

        if (ret < 0) {
                perror("read failed");

        return 0;

