#!/usr/bin/perl -w

=head1 NAME

find-hidden-word-text - find hidden text in MS Word documents

=head1 SYNOPSIS

  find-hidden-word-text word.doc > hidden.txt

=head1 DESCRIPTION

This is a command-line UNIX tool to ease the task of discovering hidden
text in MS Word documents.

More specifically, it is an implementation of Method 2 from Simon Byers'
paper, I<Scalable Exploitation of, and Responses to Information Leakage
Through Hidden Data in Published Documents>.

This goes a little further in that it removes some common 'noise' strings,
like 'Word.Document.8', 'Title', 'PAGE', 'Microsoft Word Document' and the
like.  It will also remove any strings that do not contain at least 1
whitespace character.

When more than one file is given, each file's results are preceded by the
file name.  Every output line has the form C<count|string>.

=head1 PREREQUISITES

This tool requires antiword be installed.  It also runs the C<strings>
utility on each document.

=head1 AUTHOR

Justin Mason

=head1 VERSION

1.0 Aug 15 2003 jm

=cut

use strict;
use warnings;

# Run an external command and return everything it wrote to stdout.
#
# The command is given as a list (program, then arguments) and is executed
# without a shell, so file names containing spaces or shell metacharacters
# are passed through verbatim instead of being interpreted.
#
# Dies if the command cannot be started or if closing the pipe reports a
# failure (which includes a non-zero exit status).
sub _slurp_command {
    my (@cmd) = @_;

    open(my $pipe, '-|', @cmd)
        or die "cannot run $cmd[0]: $!";
    my $output = join('', <$pipe>);
    close($pipe)
        or die "cannot run @cmd" . ($! ? ": $!" : " (wait status $?)");

    return $output;
}

# Return true if the (already whitespace-normalized) string is uninteresting
# noise that should never be reported.
sub _is_noise {
    my ($string) = @_;

    # skip almost-entirely non-alpha 4-byte snippets
    #return 1 if $string =~ /^(?:\W\w\W\W|\W\W\w\W|\w\W{3,3}|\W{4,4}|\W{3,3}\w)$/;

    # no spaces!
    return 1 if $string !~ / /;

    # skip 4-to-6-byte snippets with 1 nonalpha and no spaces
    #return 1 if ($string =~ /^\S{4,6}$/ && $string =~ /\W/);

    # common word droppings
    return 1 if $string =~ /^\s*PAGE\s*$/;
    #return 1 if $string =~ /^Word.Document.\d$/;
    return 1 if $string =~ /^Microsoft Word 9\.0$/;
    return 1 if $string =~ /^Microsoft Word Document$/;
    #return 1 if $string =~ /^Normal$/;
    #return 1 if $string =~ /^Title$/;
    #return 1 if $string =~ /^MSWordDoc$/;
    return 1 if $string =~ /^Click to edit Master text styles$/;
    return 1 if $string =~ /^Click to edit Master title style$/;
    return 1 if $string =~ /^Embedded OLE Servers$/;

    return 0;
}

# Work out which strings found in the raw document are absent from its
# visible text.
#
#   $visible_text   - text of the document as rendered by antiword
#   $strings_output - newline-separated output of strings(1) for the file
#
# Returns a hash reference mapping each candidate hidden string (with runs
# of whitespace collapsed and leading/trailing blanks removed) to the number
# of times it occurred in $strings_output.
sub count_hidden_strings {
    my ($visible_text, $strings_output) = @_;

    # normalize the antiword version so that line wrapping in the rendered
    # text cannot prevent a match
    (my $visible = $visible_text) =~ s/\s+/ /g;

    my %count_of;
    foreach my $line (split /\n/, $strings_output) {
        (my $string = $line) =~ s/\s+/ /g;
        $string =~ s/^ //;
        $string =~ s/ $//;

        # Literal substring test.  index() is used rather than a regex
        # because an empty interpolated pattern would silently reuse the
        # last successful pattern instead of matching the empty string.
        next if index($visible, $string) >= 0;

        # killfile.
        next if _is_noise($string);

        $count_of{$string}++;
    }

    return \%count_of;
}

# Only label the output with file names when several files were given.
my $print_names = (scalar @ARGV > 1) ? 1 : 0;

foreach my $file (@ARGV) {
    if ($print_names) {
        print "\n$file\n\n";
    }

    my $visible_text   = _slurp_command('antiword', '-t', $file);
    my $strings_output = _slurp_command('strings', $file);

    # get each string from strings, and see if we can find it in the
    # "visible" text from antiword
    my $count_of = count_hidden_strings($visible_text, $strings_output);

    # output the strings and their counts
    foreach my $string (sort keys %$count_of) {
        print "$count_of->{$string}|$string\n";
    }
}