#!/usr/bin/perl -w
# $Id: ufile,v 1.2 2002/05/21 20:29:12 jason Exp jason $
#
# This takes sha1sum(1), sha(1) or md5sum(1) input and prints filenames of
# unique files to stdout.  Use -v (like grep(1)) to get non-unique files
# instead, and -a to get all fields, not just the filename.
#
# Input is read from the files named on the command line, or from stdin when
# no files are given.  A one-line summary is written to stderr at the end.
#
# NOTE: fields are split on whitespace, so a filename that itself contains
# whitespace is not handled correctly (it is truncated, or mistaken for the
# hash + size + filename layout).
#
# FUTURE: accept md5(1), openssl(1), gpg(1), and digest(1) input?

use strict;
use warnings;

use Getopt::Std;
use IO::Handle;    # provides STDOUT->flush; flush.pl left core in perl 5.16

my $count        = 0;    # input records processed
my $count_unique = 0;    # records whose hash key had not been seen before
my $print_dupes  = 0;    # -v: print duplicates instead of unique files
my $print_all    = 0;    # -a: print the whole input line, not just the name

my %opts = ();
# getopts() already reports the offending option on stderr; $! carries no
# useful information here, so die with a usage line instead.
getopts('va', \%opts)
    or die "usage: $0 [-v] [-a] [file ...]\n";

$print_dupes = 1 if defined $opts{'v'};
$print_all   = 1 if defined $opts{'a'};

my %hashes;              # hash key => first filename seen with that key

while (my $line = <>) {
    next if $line =~ /^#/;    # skip comments

    my @vals = split ' ', $line;

    # Blank or truncated lines carry no hash/filename pair; do not count
    # them as files.
    next if @vals < 2;

    $count++;

    my ($hash, $file);
    if ($#vals == 2) {
        # sha1sum(1) format, hash + size + filename
        # NB: We might miss some matches if the filesizes aren't _properly_
        # reported, but at least we won't blindly fall prey to preimage
        # attacks.
        $hash = $vals[0] . "\t" . $vals[1];
        $file = $vals[2];
    }
    else {
        # lesser formats, hash + filename
        $hash = $vals[0];
        $file = $vals[1];
    }

    my $output;
    if ($print_all) {
        chomp $line;
        $output = $line;
    }
    else {
        $output = $file;
    }

    if (!exists $hashes{$hash}) {
        # use first file, not last...
        $hashes{$hash} = $file;
        $count_unique++;
        print $output, "\n" if !$print_dupes;
    }
    elsif ($print_dupes) {
        print $output, "\n";
    }
}

# Push the file list out before the summary goes to stderr.
STDOUT->flush();

print STDERR "info: $count_unique unique file(s), $count total (",
    $count - $count_unique, " duplicate(s)).\n";