#!/usr/bin/perl =head1 NAME lndir-dupes - reduce disk usage by linking identical files =head1 SYNOPSIS lndir-dupes dir1 [...] =head1 DESCRIPTION This script will descend one or more directory trees provided on the command line, and will hard-link all identical files found, of sizes greater than 1024 bytes, to each other. =head1 PERFORMANCE "lndir-dupes", run on over 200 GB of backups with lots of duplicated files, took over a day to complete, using up 620 MB of temporary storage and hit a max of 24 MB memory usage, with 18 MB resident. =head1 VERSION 1.1 Jan 3 2009 jm =head1 LICENSE Same as Perl itself =head1 AUTHOR Justin Mason Bits from http://stuff.mit.edu/afs/athena/contrib/perl/scripts/nutshell/ch6/lndir =cut # TODO: deal with maxlinks; too many links to 1 inode use strict; use warnings; use File::Find; use File::Temp; # Check args && fetch names. my $o_dryrun = 0; my $o_ignore_uidgid = 0; my $o_list_different = 0; if ($ARGV[0] && $ARGV[0] eq '-n') { $o_dryrun = 1; shift; } my @dirs = @ARGV; scalar @dirs or die "Usage: $0 dir1 [...]\n"; my $MIN_SIZE = 1024; # don't bother with smaller files my $files_count; my $files_kb; my $tmpdir = File::Temp::tempdir( CLEANUP => 1 ); my $NEWLINE_SYMBOL = "\0\0\0[[NEWLINE]]\0\0\0"; sub wanted { return unless (-f $_); my $n = $File::Find::name; my ($ndev,$nino,$nmode,$nnlink,$nuid,$ngid,$nrdev,$nsize, $natime,$nmtime,$nctime,$nblksize,$nblocks) = stat($n); return unless (defined $nsize && $nsize > $MIN_SIZE); add_file_by_size($nsize, $n); $files_count++; $files_kb += int ($nsize / 1000); } foreach my $dir (@dirs) { $files_count = 0; $files_kb = 0; File::Find::find ({ no_chdir => 1, wanted => \&wanted }, $dir); print "$dir: $files_count files, $files_kb KiB\n"; } my $files_linked_count = 0; my $files_linked_kb = 0; my $files_examined = 0; foreach my $size (sort {$b<=>$a} all_known_file_sizes()) { my @f = get_all_files_of_size($size); foreach my $oldfile (@f) { foreach my $newfile (@f) { next if $oldfile eq $newfile; if (link_if_possible($oldfile, $newfile)) { $files_linked_count++; $files_linked_kb += int ((-s $oldfile) / 1000); } } $files_examined++; progress($files_examined, $files_linked_count, $files_count); } delete_file_size_list($size); # free this up, no longer needed } print "new links created: $files_linked_count files, $files_linked_kb KiB\n"; exit; sub progress { my ($done, $linked, $total) = @_; if ($done % 1000 == 0) { print "progress: $done/$total, linked $linked\n"; } } sub dbg; sub link_if_possible { my ($old, $new) = @_; # Get stat info for both files. my ($ndev,$nino,$nmode,$nnlink,$nuid,$ngid,$nrdev,$nsize, $natime,$nmtime,$nctime,$nblksize,$nblocks) = stat($new); unless ($nino) { print "$new: $!\n"; return; } unless (-f _) { warn "$new is not a plain file\n"; return; } my ($odev,$oino,$omode,$onlink,$ouid,$ogid,$ordev,$osize, $oatime,$omtime,$octime,$oblksize,$oblocks) = stat($old); unless ($oino) { return; } unless (-f _) { warn "$old is not a plain file\n"; return; } # Quick check on size and mode. if ($nsize != $osize) { is_different($old, $new, "size is different"); return; } if (!$o_ignore_uidgid) { if ($nmode != $omode) { is_different($old, $new, "mode is different"); return; } if ($ngid != $ogid) { is_different($old, $new, "gid is different"); return; } if ($nuid != $ouid) { is_different($old, $new, "uid is different"); return; } } # Already linked? (Perhaps symbolically?) # Compare dev/inode numbers. if ($ndev == $odev && $nino == $oino) { dbg "$new already linked\n"; return; } # Now compare the two files. unless (open(NEW,"$new")) { print "$new: $!\n"; return; } unless (open(OLD,"$old")) { print "$old: $!\n"; return; } my $blksize = $nblksize || 8192; my ($nbuf, $obuf); while (read(OLD,$obuf,$blksize)) { read(NEW,$nbuf,$blksize); if ($obuf ne $nbuf) { is_different($old, $new, "differs"); return; } } # Okay, let's link. if ($o_dryrun) { if (!$o_list_different) { print "ln $new $old\n"; } return 1; } if (unlink($new) && link($old, $new)) { print " ln: $new $old\n"; return 1; } print "$new: $!\n"; return; } sub dbg { # print @_; } sub is_different { my ($old, $new, $msg) = @_; if ($o_dryrun && $o_list_different) { print "diff: $old $new\n"; } dbg($new." ".$msg."\n"); } sub add_file_by_size { my ($size, $path) = @_; open OUT, ">>$tmpdir/$size" or die "cannot open $tmpdir: $!"; $path =~ s/\n/${NEWLINE_SYMBOL}/gs; # otherwise we screw up the file format print OUT $path."\n"; close OUT or die "close failed on $tmpdir/$size: $!"; } sub all_known_file_sizes { opendir DIR, $tmpdir or die; my @files = grep { /^[0-9]/ } readdir(DIR); closedir DIR; return @files; } sub get_all_files_of_size { my ($size) = @_; open IN, "<$tmpdir/$size" or die "cannot read $tmpdir/$size: $!"; my @all = (); while () { chop; s/${NEWLINE_SYMBOL}/\n/gs; push @all, $_; } close IN; return @all; } sub delete_file_size_list { my ($size) = @_; unlink "$tmpdir/$size" or warn "unlink failed $tmpdir/$size: $!"; }