#!/usr/bin/perl
=head1 NAME

lndir-dupes - reduce disk usage by linking identical files

=head1 SYNOPSIS

lndir-dupes dir1 [...]

=head1 DESCRIPTION

This script will descend one or more directory trees provided on the command
line, and will hard-link all identical files found, of sizes greater than 1024 bytes,
to each other.

=head1 PERFORMANCE

"lndir-dupes", run on over 200 GB of backups with lots of duplicated files,
took over a day to complete, using up 620 MB of temporary storage and hit a
max of 24 MB memory usage, with 18 MB resident.

=head1 VERSION

1.1 Jan  3 2009 jm 

=head1 LICENSE

Same as Perl itself

=head1 AUTHOR

Justin Mason

Bits from http://stuff.mit.edu/afs/athena/contrib/perl/scripts/nutshell/ch6/lndir

=cut

# TODO: deal with maxlinks; too many links to 1 inode

use strict;
use warnings;
use File::Find;
use File::Temp;

# Check args && fetch names.

my $o_dryrun = 0;
my $o_ignore_uidgid = 0;
my $o_list_different = 0;

if ($ARGV[0] && $ARGV[0] eq '-n') {
  $o_dryrun = 1; shift;
}

my @dirs = @ARGV;
scalar @dirs or die "Usage: $0 dir1 [...]\n";


my $MIN_SIZE = 1024;        # don't bother with smaller files
my $files_count;
my $files_kb;
my $tmpdir = File::Temp::tempdir( CLEANUP => 1 );
my $NEWLINE_SYMBOL = "\0\0\0[[NEWLINE]]\0\0\0";

sub wanted {
  return unless (-f $_);

  my $n = $File::Find::name;
  my ($ndev,$nino,$nmode,$nnlink,$nuid,$ngid,$nrdev,$nsize,
    $natime,$nmtime,$nctime,$nblksize,$nblocks) = stat($n);
  return unless (defined $nsize && $nsize > $MIN_SIZE);

  add_file_by_size($nsize, $n);
  $files_count++;
  $files_kb += int ($nsize / 1000);
}

foreach my $dir (@dirs) {
  $files_count = 0;
  $files_kb = 0;
  File::Find::find ({ no_chdir => 1, wanted => \&wanted }, $dir);
  print "$dir: $files_count files, $files_kb KiB\n";
}

my $files_linked_count = 0;
my $files_linked_kb = 0;
my $files_examined = 0;

foreach my $size (sort {$b<=>$a} all_known_file_sizes()) {
  my @f = get_all_files_of_size($size);
  foreach my $oldfile (@f) {
    foreach my $newfile (@f) {
      next if $oldfile eq $newfile;
      if (link_if_possible($oldfile, $newfile)) {
        $files_linked_count++;
        $files_linked_kb += int ((-s $oldfile) / 1000);
      }
    }
    $files_examined++;
    progress($files_examined, $files_linked_count, $files_count);
  }
  delete_file_size_list($size);  # free this up, no longer needed
}

print "new links created: $files_linked_count files, $files_linked_kb KiB\n";
exit;

sub progress {
  my ($done, $linked, $total) = @_;

  if ($done % 1000 == 0) {
    print "progress: $done/$total, linked $linked\n";
  }
}

sub dbg;
sub link_if_possible {
  my ($old, $new) = @_;

  # Get stat info for both files.
  my ($ndev,$nino,$nmode,$nnlink,$nuid,$ngid,$nrdev,$nsize,
    $natime,$nmtime,$nctime,$nblksize,$nblocks)
      = stat($new);

  unless ($nino) {
    print "$new: $!\n";
    return;
  }

  unless (-f _) {
    warn "$new is not a plain file\n";
    return;
  }

  my ($odev,$oino,$omode,$onlink,$ouid,$ogid,$ordev,$osize,
    $oatime,$omtime,$octime,$oblksize,$oblocks)
      = stat($old);

  unless ($oino) {
    return;
  }

  unless (-f _) {
    warn "$old is not a plain file\n";
    return;
  }

  # Quick check on size and mode.

  if ($nsize != $osize) {
    is_different($old, $new, "size is different");
    return;
  }

  if (!$o_ignore_uidgid) {
    if ($nmode != $omode) {
      is_different($old, $new, "mode is different");
      return;
    }

    if ($ngid != $ogid) {
      is_different($old, $new, "gid is different");
      return;
    }

    if ($nuid != $ouid) {
      is_different($old, $new, "uid is different");
      return;
    }
  }

  # Already linked?  (Perhaps symbolically?)
  # Compare dev/inode numbers.

  if ($ndev == $odev && $nino == $oino) {
    dbg "$new already linked\n";
    return;
  }

  # Now compare the two files.

  unless (open(NEW,"$new")) {
    print "$new: $!\n";
    return;
  }
  unless (open(OLD,"$old")) {
    print "$old: $!\n";
    return;
  }
  my $blksize = $nblksize || 8192;
  my ($nbuf, $obuf);
  while (read(OLD,$obuf,$blksize)) {
    read(NEW,$nbuf,$blksize);
    if ($obuf ne $nbuf) {
      is_different($old, $new, "differs");
      return;
    }
  }

  # Okay, let's link.

  if ($o_dryrun) {
    if (!$o_list_different) {
      print "ln $new $old\n";
    }
    return 1;
  }

  if (unlink($new) && link($old, $new)) {
    print "  ln: $new $old\n";
    return 1;
  }

  print "$new: $!\n";
  return;
}

sub dbg {
  # print @_;
}

sub is_different {
  my ($old, $new, $msg) = @_;
  if ($o_dryrun && $o_list_different) {
    print "diff: $old $new\n";
  }
  dbg($new." ".$msg."\n");
}

sub add_file_by_size {
  my ($size, $path) = @_;
  open OUT, ">>$tmpdir/$size" or die "cannot open $tmpdir: $!";
  $path =~ s/\n/${NEWLINE_SYMBOL}/gs;     # otherwise we screw up the file format
  print OUT $path."\n";
  close OUT or die "close failed on $tmpdir/$size: $!";
}

sub all_known_file_sizes {
  opendir DIR, $tmpdir or die;
  my @files = grep { /^[0-9]/ } readdir(DIR);
  closedir DIR;
  return @files;
}

sub get_all_files_of_size {
  my ($size) = @_;
  open IN, "<$tmpdir/$size" or die "cannot read $tmpdir/$size: $!";
  my @all = ();
  while (<IN>) {
    chop; s/${NEWLINE_SYMBOL}/\n/gs; push @all, $_;
  }
  close IN;
  return @all;
}

sub delete_file_size_list {
  my ($size) = @_;
  unlink "$tmpdir/$size" or warn "unlink failed $tmpdir/$size: $!";
}