#!/usr/bin/perl
#
# Duplicate file finder
#
# This will scan a directory (. unless you change it) and use
# perl's Digest::MD5 module to determine if two identically
# named files are actually identical.
#
# Optionally you can have it ignore the name comparison and
# run all files against all files. That could be very slow.
#
# I use this for my photos folder to keep dupes to a minimum.
#
# USAGE:
#
# perl find-dupes > removecommands.sh
#
# This will create a list of rm commands to delete any files
# found to be duplicates along with a comment showing which
# file it is identical to. After reviewing the list, you can
# either manually delete the dupes or
#
# sh removecommands.sh
#
# That will then remove the appropriate files provided you did
# not change directory in between the two commands.
#
#
# You may feel free to use it for personal use only under the terms of the GNU GPL
# a copy of which can be found at http://gnu.org/copyleft/gpl.html
#
# Copyright 2005 Matthew Steven http://www.matts.org/

use strict;
use warnings;
use Digest::MD5;
use File::Basename qw(basename);

# When true, a file is only compared with files that have exactly the same
# base name. When false, every file is compared with every other file.
my $matchnamesonly = 1;

# Returns the hex MD5 digest of the file at $path, or undef (after a note on
# STDERR) when the file cannot be opened or read. Reading the file directly,
# rather than hashing the output of `cat`, means an unreadable file is never
# mistaken for an empty one and no shell quoting is involved.
sub file_md5 {
    my ($path) = @_;

    my $fh;
    unless (open $fh, '<', $path) {
        print STDERR "Skipping $path: cannot open: $!\n";
        return;
    }
    binmode $fh;

    # addfile() croaks on a read error, so trap it and report it instead.
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    my $error  = $@;
    close $fh;

    unless (defined $digest) {
        chomp $error;
        print STDERR "Skipping $path: cannot read: $error\n";
        return;
    }
    return $digest;
}

my $find_output = `find . -iname "*.jpg" -o -iname "*.jpeg" -o -iname "*.png"`;
# Or, if you want to find every sort of file switch to
# my $find_output = `find . -type f`;
defined $find_output or die "Could not run find: $!\n";

# Keep regular files only: a directory that happens to match the name
# patterns cannot be hashed and must never end up in an rm command.
my @files = grep { -f $_ } split /\n/, $find_output;

# How many files share each base name.
my %count_of_name;
$count_of_name{ basename($_) }++ for @files;

# In name-matching mode a file is a possible dupe when an earlier file
# already has the same base name; otherwise every file is a possible dupe.
my $possible = $matchnamesonly
    ? scalar(@files) - scalar(keys %count_of_name)
    : scalar(@files);

# The message text (including the line break after the count) is kept as-is.
print STDERR "Found " . $possible . " \npossible files to check...\n";
sleep 3;

# Group files with identical content. Each file is hashed once. In
# name-matching mode the base name is part of the group key, so only
# identically named files can land in the same group.
my %copies_of;     # group key => [ paths, in find order ]
my @group_order;   # group keys in first-seen order, for stable output

for my $file (@files) {
    my $name = basename($file);

    # A name that occurs only once cannot have a same-named duplicate.
    next if $matchnamesonly && $count_of_name{$name} < 2;

    print STDERR "Testing $file ...\n";
    my $digest = file_md5($file);
    next unless defined $digest;

    # The digest is always 32 hex characters, so "digest/name" is unambiguous.
    my $key = $matchnamesonly ? "$digest/$name" : $digest;
    push @group_order, $key unless $copies_of{$key};
    push @{ $copies_of{$key} }, $file;
}

# For every group of identical files keep exactly one copy and emit an rm
# command for each of the others.
for my $key (@group_order) {
    my @copies = @{ $copies_of{$key} };
    next if @copies < 2;

    # I wanted to give removal preference to files in my 'Unsorted' folder,
    # so keep the first copy that lives elsewhere; if every copy is in
    # 'Unsorted', keep the first one found.
    my ($keeper) = grep { !/nsorted/ } @copies;
    $keeper = $copies[0] unless defined $keeper;

    for my $dupe (grep { $_ ne $keeper } @copies) {
        print "rm -f " . quotemeta($dupe) . " # dupe of $keeper\n";
        print STDERR "$dupe is a dupe of $keeper\n";
    }
}