#!/usr/bin/perl -w =head1 NAME vbulletin2mail - gate from a vBulletin web board to email =head1 SYNOPSIS [edit script to change settings] vbulletin2mail recipient@email.address =head1 NOTE Requires 'lynx'. =head1 VERSION Dec 2 2001 jm =cut use vars qw($FROM $SUBJ_TAG $THREAD_BASE $FORUM $MSGID_BASE $SECS_BETWEEN_THREADS $SECS_FOR_MAIL_DELIVERY $DEBUG $MY_URL); # ------------------------------------------------------------------------- # Edit these values to suit the board you want to scrape: # The forum and thread-showing URLs to read. $FORUM = 'http://www.boards.ie/vbulletin/forumdisplay.php?forumid=60'; $THREAD_BASE = 'http://www.boards.ie/vbulletin/showthread.php?'; # A tag to add to subject lines. $SUBJ_TAG = '[IOForum] '; # Address the mails appear to come from. $FROM = 'vbulletin2mail@taint.org'; # You probably won't need to change anything from here on in... # ------------------------------------------------------------------------- # used to make a valid message-Id $MSGID_BASE = '.vbulletin2mail@taint.org'; # sleep times, to avoid hammering the server $SECS_BETWEEN_THREADS = 5; $SECS_FOR_MAIL_DELIVERY = 2; # set debug=1 to load from files called "forum.txt" and "thread.txt", # and never send mail. $DEBUG = 0; # ------------------------------------------------------------------------- # URL about this script $MY_URL = 'http://jmason.org/software/'; BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); } use AnyDBM_File; use Fcntl; use Time::Local; use POSIX qw(strftime); use strict; use vars qw(%ALREADY_SEEN); my $numthreads = 0; if ($ARGV[0] eq '-n') { shift; $numthreads = shift (@ARGV) + 0; } my $TO = $ARGV[0]; if (!defined $TO) { die "usage: vbulletin2mail recipient\@email.address\n"; } doforum ($FORUM); # ------------------------------------------------------------------------- sub doforum { my $forum = shift; $forum .= '&sortfield=lastpost&daysprune=1'; print "Getting forum index: $forum\n"; if ($DEBUG) { open (IN, "< forum.txt"); } else { open (IN, "lynx -dump '$forum' |"); } my $today = strftime ("%d-%m-%Y", gmtime time); my %threads = (); my %linkstofollow = (); my %postdate = (); my $postdate = undef; my $howmany = 0; while () { # look for the 'last post' dates and keep only the ones from today if (/ (${today} \d\d:\d\d ..)\s*$/o || / (${today} \d\d:\d\d)\s*$/o) { $postdate = $1; } if (/ \[(\d+)\] Go to last post\s*$/) { if (defined $postdate || $howmany < $numthreads) { $linkstofollow{$1} = 1; $postdate{$1} = $postdate; $postdate = undef; } } if (/^ *(\d+)\. http:\/\/.*\&threadid=(\d+)/) { if (defined $linkstofollow{$1}) { $threads{$2} = 1; $howmany++; print "Will read thread $2, last post: ". (defined $postdate{$1} ? $postdate{$1} : "(not today)"). "\n"; } } } close IN; # read in reverse order; oldest first. my @threads = (reverse sort keys %threads); my $count = 1; foreach my $thread (@threads) { opendb ($FORUM, $TO); dothread ($forum, $thread); closedb (); # sync up warn "Finished checking thread $count of ".($#threads+1). ". Sleeping for $SECS_BETWEEN_THREADS seconds...\n"; sleep $SECS_BETWEEN_THREADS; $count++; } } # ------------------------------------------------------------------------- sub dothread { my $forum = shift; my $thread = shift; my $text = ''; my %url = (); my $threadurl = $THREAD_BASE."threadid=$thread\&perpage=999"; print "Thread: $threadurl\n"; if ($DEBUG) { open (IN, "< thread.txt"); } else { open (IN, "lynx -dump '$threadurl' |"); } while () { s/ +/ /gs; $text .= $_; /^ *(\d+)\. (http:\/\/.*$)/ and $url{$1} = $2; } close IN; my $thread_subject = ''; $text =~ / : Powered by vBulletin version \d+.*?\s+>\s+([^>]+?)\n\n\s+\[\d+\]Last Thread\s+\[\d+\]Next Thread\n\n/s and $thread_subject = $1; $thread_subject =~ s/\s+/ /gs; $thread_subject =~ s/ Thread Rating: \d+.*$//g; $text =~ s/^.*? Author\s+Thread .*?Post A Reply//gs or warn "Author/Thread/Post New Thread/Post A Reply not found"; $text =~ s/\n All times are .*?The time now is .*?Post New Thread.*?Post A Reply.*$//gs or warn "All times are.../Post New Thread/Post A Reply not found"; my $key = 0; my %mails_to_send = (); while ($text =~ s/^\s+(\S.*?\S)\s+\[\d+\]Edit\/Delete\s+Message\s*\[\d+\]Reply\s+w\/Quote//s) { my $msg = $1; my $date = ''; my $poster = ''; my $subject = ''; my $body = ''; # Date could be in 24 hour or AM/PM format $msg =~ /Post (\d\d-\d\d-\d\d\d\d \d\d:\d\d(?: ..|))/ and $date = $1; $msg =~ /^(\S[^\n]*?)\n/s and $poster = $1; # find the end of the message, and replace it with a token. $msg =~ s{\s+\[\d+\]Link\s+ \|\s+\[\d+\]Report\s+this\s+post\s+to\s+a\s+mod(?:erator|)\s+ \|\s+IP:\s+\[\d+\]Logged}{__MeSsAgE_EnD__}gxs; $msg =~ s{\s+\[\d+\]Report\s+this\s+post\s+to\s+a\s+mod(?:erator|)\s+ \|\s+IP:\s+\[\d+\]Logged}{__MeSsAgE_EnD__}gxs; $msg =~ s{\s+\[\d+\]Link\s+ \|\s+\[\d+\]Report\s+ \|\s+IP:\s+\[\d+\]Logged}{__MeSsAgE_EnD__}gxs; # trim crap from start of msg $msg =~ s{\s+Posts: \d+ \(\S+ per day\)\s*}{\n}gs; $msg =~ s{\s+Posts: \d+\s*}{\n}gs; $msg =~ s{\nC?Mod: [^\n]*\s*\n}{\n}gs; $msg =~ s{\nLast edited by \S+ on[^\n]+\n}{\n}gs; my $search = $msg; if ($msg =~ /Registered: [^\n]*?\n([^\n]*?)\n *\n\s+(.*?)__MeSsAgE_EnD__/s || $msg =~ /Registered: [^\n]*?\n()(.*?)__MeSsAgE_EnD__/s) { $subject = $1; $body = $2; } if ($date !~ /\S/ || $poster !~ /\S/ || $body !~ /\S/) { warn "Skipping message, date='$date' poster='$poster' body:\n"; print "\n'$body'\nsearch: '$search'\nfull: '$msg'\n\n"; next; } $body =~ s/^\n+//gs; $body =~ s/\n /\n/gs; $body =~ s/\s+$//gs; my $urls = ''; foreach my $ref ($body =~ /\[(\d+)\]/g) { $urls .= " $ref\. $url{$ref}\n"; } if ($urls ne '') { $body .= "\n\nReferences:\n\n$urls"; } if ($subject eq '') { $subject = $thread_subject; } $poster =~ s/[\0 \t\n\)\<\>]+/ /gs; $subject =~ s/\s+/ /gs; my $timet; if ($date =~ /^(\d+)-(\d+)-(\d+) (\d+):(\d+) (A|P)M/) { $timet = timegm (0, $5, $4 + ($6 eq 'P' ? 12 : 0), $1, $2-1, $3); } elsif ($date =~ /^(\d+)-(\d+)-(\d+) (\d+):(\d+)$/) { $timet = timegm (0, $5, $4, $1, $2-1, $3); } else { $timet = time; } my $maildate = strftime ("%a, %e %b %Y %H:%M:%S +0000", gmtime $timet); # print "summary date=$maildate poster=$poster subj=$subject\n"; my $msgid = "$poster.$date.$thread"; $msgid =~ s/[^-\_A-Za-z0-9\.\,\=]/_/gs; $msgid = "<$msgid$MSGID_BASE>"; if (already_seen ($msgid)) { next; } else { my $mail = < $poster, subject => $subject, mail => $mail, msgid => $msgid }; $key++; } } # send in order; oldest first. foreach my $key (sort keys %mails_to_send) { my $obj = $mails_to_send{$key}; print "unseen: [$obj->{poster}] $obj->{subject}\n"; send_mail ($obj->{mail}, $obj->{msgid}); } } # ------------------------------------------------------------------------- sub send_mail { my $mail = shift; my $msgid = shift; if (!$DEBUG) { open (SM, "| /usr/lib/sendmail -oi -t") or die "sendmail"; print SM $mail; close SM or warn "sendmail failed"; sleep $SECS_FOR_MAIL_DELIVERY; } mark_seen ($msgid); } # ------------------------------------------------------------------------- sub already_seen { my $msgid = shift; return $ALREADY_SEEN{$msgid}; } sub mark_seen { my $msgid = shift; $ALREADY_SEEN{$msgid} = 1; } # ------------------------------------------------------------------------- sub opendb { my $forum = shift; my $to = shift; my $dir = $ENV{'HOME'}."/.vbulletin2mail"; if (!-d $dir) { mkdir ($dir, 0755); } $forum =~ s/[^a-z0-9]+/_/gis; $to =~ s/[^a-z0-9]+/_/gis; my $fname = $dir."/".$forum.".".$to; tie (%ALREADY_SEEN, 'AnyDBM_File', $fname, O_CREAT|O_RDWR, 0644) or die "cannot open/create already-seen db $fname"; } sub closedb { untie (%ALREADY_SEEN) or warn "untie failed"; } # -------------------------------------------------------------------------