#!/usr/bin/perl my $site = shift @ARGV; my $google_api_key = shift @ARGV; if (!$google_api_key) { die q{ goog-love.pl - find out where your site's google juice comes from This script will grind through your web site's "access.log" file (which must be in the "combined" log format). It'll pick out the top 100 Google searches found in the referer field, re-run those searches, and determine which ones are giving your website all the linky Google love -- in other words, the searches that your site 'wins' on. The output is in plain text and a chunk of HTML. usage: goog-love.pl sitehost google-api-key < access.log > out.html e.g. cat /var/www/logs/taint.org.* | \ goog-love.pl taint.org kjkdf909403g0fg0f90dfgdf0g09gfd | \ tee out.html NOTE: this script requires the "SOAP::Lite" module be installed. Install it using "apt-get install libsoap-lite-perl" or "cpan SOAP::Lite". It also requires a Google API key. }; } # rev: May 5 2006 jm - added 'sleep 1' between all invocations for same # Apr 8 2006 jm - added retries for "Bad Gateway" goog errors on searches # Mar 7 2006 jm # Feb 7 2006 jm - less commentary # --------------------------------------------------------------------------- use warnings; use strict; use SOAP::Lite; use CGI; # to decode the CGI-formatted query string # cf http://www.oreillynet.com/pub/a/network/excerpt/ggl_hcks/index.html # for the general Google-API idea. Net::Google doesn't work anymore it # seems :( $|=1; print "progress: resolving Google SOAP service...\n"; my $google = SOAP::Lite->service("http://api.google.com/GoogleSearch.wsdl"); # $ua = LWP::UserAgent->new; # $ua->env_proxy; my %queries; $| = 1; readlogs(); google(); summarise(); exit; # --------------------------------------------------------------------------- # "http://www.google.it/search?q=%2B%22Bob+Menschel%22+%2Bspamassassin&hl=it&lr=&client=firefox-a&rls=org.mozilla:en-US:official&start=10&sa=N" # "http://www.google.com/search?q=%22History+of+CD-ROM%22&hl=en&lr=&rls=GGLD,GGLD:2005-09,GGLD:en&start=50&sa=N" # etc. # sub readlogs { print "progress: reading log lines...\n"; while (<>) { /^\d\S+ \S+ \S+ \S+ \S+ "GET (.*?) HTTP\/1.0" \S+ \S+ "(.+?)" "/ or next; my $path = $1; my $referer = $2; my $query; if ($referer =~ /google/i && $referer =~ /[\?\&]q=([^\&]+)(?:\&|$)/) { $query = $1; } if ($query) { add_query($query, $path); } } } sub add_query { my ($query, $path) = @_; $queries{$query} ||= { paths => { }, count => 0 }; $queries{$query}{paths}{$path} = 1; $queries{$query}{count}++; } sub google { my $maxqueries = 100; my $qcount; print "progress: found ".(scalar keys %queries)." searches. ". "checking the top $maxqueries against google...\n"; foreach my $query (sort { $queries{$b}{count} <=> $queries{$a}{count} } keys %queries) { $qcount++; last if ($qcount > $maxqueries); my $cgi = CGI->new('q='.$query); my $qstring = $cgi->param('q'); my $count = $queries{$query}{count}; $queries{$query}{rawqstring} = $query; $queries{$query}{qstring} = $qstring; $queries{$query}{posn} = 999999; # ie. "crappy" # print "googling: [$qstring] ($count hits)\n"; # frequent sleeps to appease the angry Google API god; # forgive me oh great one! sleep 1; # protect against die(), and retry, due to (frequent!) # '502 Bad Gateway' errors otherwise. boo google my $results; for my $retry (1 .. 10) { eval { sleep 1; # not too frequently, I think that triggers # the "502 Bad Gateway" and "500 Internal Server Error" # messages that google otherwise sends half the time. $results = $google->doGoogleSearch( $google_api_key, $qstring, 0, 10, "false", "", "false", "", "latin1", "latin1" ); }; if ($@) { warn "google search failed! $@ Retrying, try $retry..."; sleep 4; next; } # else last; } ($results && @{$results->{'resultElements'}}) or next; my $posn = 0; foreach my $result (@{$results->{'resultElements'}}) { $posn++; # print $result->{URL}, $result->{snippet},"\n"; if ($result->{URL} !~ /${site}/i) { next; } $queries{$query}{posn} = $posn; my $url = $queries{$query}{url} = $result->{URL}; my $totresults = $queries{$query}{totalresults} = $results->{'estimatedTotalResultsCount'}; my $l = "#$posn of $totresults for [$qstring], with $count queries: $url\n"; print "progress: \@$qcount - $l"; last; } } } sub summarise { my $otext = ''; my $ohtml = ''; foreach my $query (sort { $queries{$a}{posn} <=> $queries{$b}{posn} or $queries{$b}{totalresults} <=> $queries{$a}{totalresults} or length($queries{$a}{qstring}) <=> length($queries{$b}{qstring}) } grep { $queries{$_}{posn} && ($queries{$_}{posn} < 999999) && $queries{$_}{qstring} && $queries{$_}{url} } keys %queries) { my $url = $queries{$query}{url}; my $posn = $queries{$query}{posn}; my $qstring = $queries{$query}{qstring}; my $count = $queries{$query}{count}; my $rawqstring = $queries{$query}{rawqstring}; my $totresults = $queries{$query}{totalresults}; my $qhref = 'http://www.google.com/search?num=10&q='.$rawqstring; my $l = "#$posn of $totresults for [$qstring], with $count queries: $url\n"; $otext .= $l; $ohtml .= qq{
  • #$posn of $totresults for [$qstring], with $count queries: $url
  • }; } print qq{ }; }