#!/usr/bin/perl

=head1 NAME

new-referrer-rss - generate RSS feed of new referrer URLs from access_log

=head1 SYNOPSIS

new-referrers-rss nameofsite [source ...] > new-referrers.xml

source: access_log files or directory containing same.
'new-referrers.xml' should be at a web-visible location.

=head1 DESCRIPTION

Given the name of a web site, and a selection of Apache combined log
format 'access_log' files containing referrer URL data, this will
generate an RSS feed containing the latest referrers.

The script should be run periodically with 'fresh' access_log data, from
cron.  A file called 'hist' in the current directory is created to hold
historical context information; using this, if a URL is listed in the RSS
output, it will not be listed again.  (As a result, subscribers should
ensure that they do not update less frequently than the cron job
executes!)

=head1 EXAMPLE

new-referrers-rss taint.org /var/log/apache/taint.org/20060* \
    > scraped/referrers-taint.org.xml

=head1 PREREQUISITES

This tool requires the following CPAN modules:

XML::RSS URI::Escape Time::Local

=head1 AUTHOR

Justin Mason

=head1 SEE ALSO

http://taint.org/2006/05/09/231507a.html for comments

=head1 VERSION

- 1.2 May 14 2006 jm: also exclude "/"
- 1.1 May 10 2006 jm: put full URL into 'link' area, consider 3xx HTTP
  response codes as valid
- 1.0 May  9 2006 jm: initial rev

=cut

# NOTE(review): this file was reconstructed from a whitespace-mangled copy in
# which angle-bracketed tokens had been stripped (e.g. "while (<IN>)" had
# become "while ()", and the author's C<...> email escape in the POD was
# emptied).  The restored readline loops and 'hist' open below follow the
# obvious intent; the author email could not be recovered.

###########################################################################

use strict;
use warnings;

use File::Find;
use XML::RSS;
use URI::Escape;
use Time::Local;
use POSIX qw(strftime);

my $site = shift @ARGV;
die " usage: new-referrers-rss nameofsite [access_log files or dir ...]
        > o.xml " unless $site;

# Referrers from the site itself are never interesting.
my $EXCLUDE_RE = qr{http://(?:www\.)?${site}(?:/|$)}i;

# Minimum number of hits before a referrer is worth reporting.
my $MIN_COUNT = 5;

# always list referrer sites that are new in this time period
my $ALWAYS_LIST_NEWER_THAN = 1 * (24 * 60 * 60);

# Hits older than this are ignored entirely.
my $IGNORE_OLDER_THAN = 30 * (24 * 60 * 60);

# Skip referrals whose target is an image (hotlink embeds, not real links).
my $IGNORE_IMAGE_EMBEDS = 1;

###########################################################################

my %count   = ();    # referrer URL => hit count
my %details = ();    # referrer URL => { target, earliest, latest }

# Convert the relative windows into absolute epoch-second cutoffs.
$ALWAYS_LIST_NEWER_THAN = time - $ALWAYS_LIST_NEWER_THAN;
$IGNORE_OLDER_THAN      = time - $IGNORE_OLDER_THAN;

my %mon2mm = (
    Jan => 0, Feb => 1, Mar => 2,  Apr => 3,    # note 0 based
    May => 4, Jun => 5, Jul => 6,  Aug => 7,
    Sep => 8, Oct => 9, Nov => 10, Dec => 11
);

my $siteurl = "http://" . $site;

foreach my $dir (@ARGV) {
    File::Find::find(\&wanted, $dir);
}

# File::Find callback: parse one access_log file, accumulating per-referrer
# hit counts and earliest/latest timestamps into %count / %details.
sub wanted {
    return unless (-f $_);
    open(my $in, '<', $_) or return;    # unreadable files are skipped silently

    while (<$in>) {
        # 71.235.13.232 - - [01/Apr/2006:00:01:10 +0100] "GET
        # /wp-content/themes/jmason/images/bg.gif HTTP/1.1" 200 992
        # "http://taint.org/2004/11/" "Mozilla/4.0 (compatible; MSIE 6.0;
        # Windows NT 5.1; .NET CLR 1.1.4322)"
        /^\S+ \S+ \S+ \[([^\]]+?)\] "\S+ ([^"]+?) \S+" (\d+) \S+ "([^"]+?)"/
            or next;
        my $when = $1;
        my $tgt  = $2;
        my $code = $3;
        my $refr = $4;

        next if ($code !~ /^[23]/);    # ignore error codes please
        next unless ($refr =~ /^http/);
        next if ($refr =~ m{^$EXCLUDE_RE});

        # Parse "01/Apr/2006:00:01:10"; skip the line rather than reuse
        # stale $1..$6 captures if the timestamp is malformed.
        $when =~ /^(\d\d)\/(...)\/(\d\d\d\d):(\d\d):(\d\d):(\d\d) /
            or next;
        $when = timegm($6, $5, $4, $1, $mon2mm{$2}, $3) || 0;
        next if ($when < $IGNORE_OLDER_THAN);

        $count{$refr}++;
        $details{$refr}->{target} = $tgt;

        my $t = $details{$refr}->{earliest};
        if (!defined $t || $when < $t) {
            $details{$refr}->{earliest} = $when;
        }
        $t = $details{$refr}->{latest};
        if (!defined $t || $when > $t) {
            $details{$refr}->{latest} = $when;
        }
    }
    close $in;
}

my %hist = ();    # core-domain => { t, tgt, u }; persisted in ./hist
read_history();

my $now = time;

my $rssfile = XML::RSS->new(version => '1.0');
$rssfile->channel(
    title       => "New referrers for $site",
    description => "New referrers for $site",
    link        => "http://$site/"
);

my $maxitems = 20;    # cap on RSS items per run

# Most-frequent referrers first; ties broken by reversed string order.
foreach my $refr (sort { $count{$b} <=> $count{$a} or $b cmp $a } keys %count)
{
    next unless $count{$refr} >= $MIN_COUNT;
    last unless add_item(
        $count{$refr},
        $details{$refr}->{target},
        $details{$refr}->{earliest},
        $details{$refr}->{latest},
        $refr
    );
}

$rssfile->{output} = '1.0';
print $rssfile->as_string;
write_history();
exit;

# Add one referrer to the RSS feed unless it should be skipped (already in
# history, too old, or an image embed).  Records the referrer's core domain
# in %hist so it is not reported again on later runs.
# Returns 1 to continue iterating, 0 once $maxitems items have been emitted.
sub add_item {
    my ($count, $target, $earliest, $latest, $ref) = @_;
    my $dom = url_to_core_domain($ref);

    my $skip = 0;
    if ($earliest && $earliest > $ALWAYS_LIST_NEWER_THAN) {
        $skip = 0;    # brand-new this period: always list it
    }
    elsif ($hist{$dom}) {
        $skip = 1;    # already reported in a previous run
    }
    if ($latest && $latest < $IGNORE_OLDER_THAN) {
        $skip = 1;    # no hits recent enough to care about
    }

    my $isimage = 0;
    if ($target =~ /\.(?:gif|jpe?g|png|tiff?)(?:\?|\#|$)/i) {
        $isimage = 1;
    }
    if ($isimage && $IGNORE_IMAGE_EMBEDS) {
        $skip = 1;
    }

    return 1 if $skip;

    # print "$count $skip $dom $target $ref\n";

    $earliest = strftime "%Y-%m-%d %H:%M:%S", gmtime $earliest;
    $latest   = strftime "%Y-%m-%d %H:%M:%S", gmtime $latest;

    $rssfile->add_item(
        title       => $ref,
        link        => $ref,
        description => qq{
URL: $ref

Linking to: $siteurl$target

Followed $count times between $earliest and $latest.

});

    $hist{$dom} = { t => $now, tgt => $target, u => $ref };

    return 0 if ($maxitems-- == 0);
    return 1;
}

# Load %hist from the './hist' file, one "dom t=N tgt=PATH u=URL" record per
# line.  A missing file is fine (first run).
sub read_history {
    open(my $in, '<', 'hist') or return;
    while (<$in>) {
        /^(\S+) t=(\d+) tgt=(\S+) u=(.*)/ or next;
        $hist{$1} = { t => $2, tgt => $3, u => $4 };
    }
    close $in;
}

# Persist %hist back to './hist' in the same one-record-per-line format.
sub write_history {
    open(my $out, '>', 'hist') or die "cannot write hist";
    foreach my $dom (keys %hist) {
        my $h = $hist{$dom};
        print {$out} "$dom t=$h->{t} tgt=$h->{tgt} u=$h->{u}\n";
    }
    close $out or die "cannot save hist";
}

# Reduce a URL to a "core domain" used as the history key.
# Drops the protocol, a leading "www."/"www2." etc., and common TLD
# suffixes; this means we treat "google.com" / "www.google.co.uk" /
# "google.ie" as equivalent.  Falls back to the full host, then to the
# whole (stripped) URL if there is no trailing slash.
sub url_to_core_domain {
    my $url = $_[0];
    $url =~ s{^[a-z0-9]+://}{}i;    # remove protocol
    $url =~ s{^www\d*\.}{}i;        # remove "www."

    # Only use $1 when the match actually succeeded; a bare "if ($1)" after
    # a failed match would see a stale capture from an earlier regex.
    if ($url =~ m{^([^/]+?)
                  (?:
                      \.co\.(?:uk|ve|il|jp|id)|
                      \.com\.(?:au|co|sg|pr|bh)|
                      \.[a-z]*
                  )?/}x
        && $1)
    {
        return $1;
    }

    # otherwise, domain including full TLD
    if ($url =~ m{^([^/]+?)/} && $1) {
        return $1;
    }
    return $url;
}

# validator URL for jmason's dev feed:
# http://feedvalidator.org/check.cgi?url=http%3A%2F%2Ftaint.org%2Fscraped%2Freferrers-taint.org.xml