#!/usr/bin/perl -w -s

=head1 NAME

link-same - hard link files with same content

=head1 SYNOPSIS

  link-same [-size=min_zize] new [old]

=head1 DESCRIPTION

Hardlinks files in old to files in new if content is the same.
If old is omitted: Find files with same content and hardlink those

-size sets the minimum size of files to consider. This is usefull for
avoiding a gazillion hardlinks to the empty file.

=head1 EXAMPLE

link-same public_html private_html

Hardlink all the common files in public_html and private_html.

link-same public_html

Hardlink all files in public_html that have the same content to save
diskspace.

=head1 BUGS

The owner and permissions will only be preserved if the owner of the
files is the same. Otherwise the file will be owned by one of the
original owners.

=head1 AUTHOR

Ole Tange, http://ole.tange.dk

=head1 LICENSE

GPL http://www.gnu.org/copyleft/gpl.html

=head1 SEE ALSO

ln(1).

=cut

$debug=1;
use Digest::MD5;

$srcdir=shift || die("Usage: $0 old [new]");
if($dstdir=shift) {
    two_dir_compare($srcdir,$dstdir);    
} else {
    one_dir_compare($srcdir);
}

sub one_dir_compare {
    my $srcdir = shift; # start of source dirtree
    examine_one_dir($srcdir);
}

sub examine_one_dir {
    my $srcdir = shift; # start of source dirtree
    my $subdir = shift || "."; # . if current dir

    print $srcdir,"\n" if $debug;
    # Skip symbolic link to avoid endless recursion
    -l $srcdir and return;
    -d "$srcdir/$subdir" || die("$srcdir/$subdir is not a dir");
    opendir(DIR,"$srcdir/$subdir") || die;
    # for each file in dir (except . and ..)
    for $file (grep { not /^\.\.?$/ } readdir DIR) {
        my $srcfile="$srcdir/$subdir/$file";
        print $srcfile,"\n" if $debug;
        # Skip symbolic link to avoid endless recursion
        -l $srcfile and next;
        if(-d $srcfile) { # Or filehandle: _
            # if directory: Recurse through that
            examine_one_dir($srcdir,"$subdir/$file");
            next;
        } elsif (-f $srcfile) { # Or filehandle: _
            # if regular file: check size
	    if($size) {
		if (-s $srcfile < $size) { next; }
	    }
	    # and check MD5
	    my $md5 = md5_of($srcfile);
	    if($known_md5{$md5}) {
		unlink $srcfile || die;
		link ($known_md5{$md5},$srcfile);
	    } else {
		$known_md5{$md5}=$srcfile;
	    }
            next;
        };
    }
}

sub two_dir_compare {
    my $srcdir = shift; # start of source dirtree
    my $dstdir = shift; # start of dest dirtree

    if(join(':',stat($srcdir)) eq join(':',stat($dstdir))) {
	die("$srcdir == $dstdir");
    }
    examine_two_dirs($srcdir,$dstdir);
}

sub examine_two_dirs {
    my $srcdir = shift; # start of source dirtree
    my $dstdir = shift; # start of dest dirtree
    my $subdir = shift || "."; # . if current dir

    # Skip symbolic link to avoid endless recursion
    -l $srcdir and return;
    -d "$srcdir/$subdir" || die("$srcdir/$subdir is not a dir");
    opendir(DIR,"$srcdir/$subdir") || die;
    # for each file in dir (except . and ..)
    for $file (grep { not /^\.\.?$/ } readdir DIR) {
        my $srcfile="$srcdir/$subdir/$file";
        print $srcfile,"\n" if $debug;
        # Skip symbolic link to avoid endless recursion
        -l $srcfile and next;
        if(-d $srcfile) { # Or filehandle: _
            # if directory: Recurse through that
            examine_two_dirs($srcdir,$dstdir,"$subdir/$file");
            next;
        } elsif (-f $srcfile) { # Or filehandle: _
            # if regular file: check MD5
            my $dstfile="$dstdir/$subdir/$file";
            # Skip symbolic link to avoid hardlinking to a symlink
            -l $dstfile and next;
            -f $dstfile and do {
                if(same_md5($srcfile,$dstfile)) {
                    # Hardlink
                    unlink $dstfile || die;
                    link ($srcfile,$dstfile);
                }
            };
            next;
        };
    }
}

sub md5_of {
    my $srcfile = shift;
    my $srcmd5 = Digest::MD5->new;
    open(SRCFILE,"$srcfile\0") # perl does not like files ending in " "
	|| die;
    $srcmd5->addfile(*SRCFILE);
    close SRCFILE;
    return($srcmd5->digest);
}

sub same_md5 {
    my $srcfile = shift;
    my $dstfile = shift;
    return(md5_of($srcfile) eq md5_of($dstfile));
}
