# Blosxom Plugin: autotrack -*- cperl -*- # Author: Todd Larason (jtl@molehill.org) # Version: 0+2i # Blosxom Home/Docs/Licensing: http://blosxom.sourceforge.net/ # AutoTrack plugin Home/Docs/Licensing: # http://molelog.molehill.org/blox/Computers/Internet/Web/Blosxom/AutoTrack/ package autotrack; # -------------- Configuration Variables -------------- # regular expression matching URLs not to trackback # there's no reason to try google or amazon, and most people don't # want to trackback their own stories $dont_tb_re = qr!(?:http://(?: (?: www\.google\.com ) | (?: www\.amazon\.com ) ) ) | (?: $blosxom::url )!ix unless defined $dont_tb_re; # what to do if the timestamp file doesn't exist? if $start_from_now is # set, autotrack future stories but not old ones $start_from_now = 1 unless defined $start_from_now; # how automatic? if $semi_auto is set, only send trackbacks if ?autotrack=yes # otheriwse, fully automatic $semi_auto = 1 unless defined $semi_auto; # networking implementation to be used # can be 'LWP', 'curl' or 'wget' (or a full pathname to a curl or wget # executable) # wget must be at least a 1.9 beta to support the --post-data option $networking = "LWP" unless defined $networking; $debug_level = 1 unless defined $debug_level; # ----------------------------------------------------- # template-visible vars use CGI qw/param/; use HTML::Entities; use File::stat; use strict; use vars qw/$dont_tb_re $start_from_now $semi_auto $networking $debug_level/; my $dont_really_ping = 0; my $package = "autotrack"; my $timestamp_file = "$blosxom::plugin_state_dir/.$package.timestamp"; my $last_timestamp; my $files; sub debug { my ($level, @msg) = @_; if ($debug_level >= $level) { print STDERR "$package debug $level: @msg\n"; } } # utility funcs sub url_escape { local ($_) = @_; s/([^a-zA-Z0-9])/sprintf("%%%02x",ord($1))/eg; s/%20/+/g; return $_; } sub GET { my ($url) = @_; if ($networking =~ m:curl:) { return `$networking -m 30 -s $url`; } elsif ($networking =~ m:wget:) { return `$networking --quiet -O - $url`; } elsif ($networking eq 'LWP') { foreach (qw/LWP::UserAgent HTTP::Request::Common/) { eval "require $_"; if ($@) { debug(0, "Can't load $_, can't use LWP networking"); return undef; } } my $ua = LWP::UserAgent->new; my $res = $ua->request(HTTP::Request::Common::GET $url); if (!$res->is_success) { my $error = $res->status_line; debug(0, "HTTP GET error: $error"); return undef; } return $res->content; } else { debug(0, "ERROR: invalid \$networking $networking"); } } sub POST { my ($url, %vars) = @_; if ($networking =~ m:curl:) { my $command = "$networking -m 30 -s "; $command .= join ' ', map {my $v = url_escape($vars{$_}); "-d $_=$v";} keys %vars; $command .= " $url"; debug(2, "Posting with :$command:"); return `$command` unless $dont_really_ping; return "0"; # for testing } elsif ($networking =~ m:wget:) { my $command = "$networking --quiet -O - --post-data='"; $command .= join '&', map {my $v = url_escape($vars{$_}); "$_=$v"} keys %vars; $command .= "' $url"; debug(2, "Posting with :$command:"); return `$command` unless $dont_really_ping; return "0"; # for testing } elsif ($networking eq 'LWP') { foreach (qw/LWP::UserAgent HTTP::Request::Common/) { eval "require $_"; if ($@) { debug(0, "Can't load $_, can't use LWP networking"); return undef; } } my $ua = LWP::UserAgent->new; my $res = $ua->request(HTTP::Request::Common::POST($url, [%vars])); if (!$res->is_success) { my $error = $res->status_line; debug(0, "HTTP POST error: $error"); return undef; } return $res->content; } else { debug(0, "ERROR: invalid \$networking $networking"); } } sub get_trackback_url { my ($url) = @_; return undef if ($url =~ m:$dont_tb_re:); my $text = GET($url); return undef if (!defined($text)); while ($text =~ m!()!msg) { my $rdf = $1; my ($id) = ($rdf =~ m!dc:identifier="([^\"]+)"!); next unless ($id eq $url); my ($tb) = ($rdf =~ m!trackback:ping="([^\"]+)"!); return $tb if defined $tb; } if ($url =~ m:(.*)\#:) { $url = $1; while ($text =~ m!()!msg) { my $rdf = $1; # XXX is this good enough? Can't the namespace IDs be different? # the sample code in the spec @ # http://www.movabletype.org/docs/mttrackback.html # does it this way my ($id) = ($rdf =~ m!dc:identifier="([^\"]+)"!); next unless ($id eq $url); my ($tb) = ($rdf =~ m!trackback:ping="([^\"]+)"!); return $tb if defined $tb; } } debug(2, "Couldn't find tb url for $url"); return undef; } sub ping_trackback { my ($tb_url, $title, $excerpt, $url) = @_; my $txt = POST($tb_url, title => $title, url => $url, excerpt => $excerpt, blog_name => $blosxom::blog_title); debug(3, "Response:$txt:"); if ($txt =~ m:(.*?):ms) { if ($1) { my $errcode = $1; $txt =~ m:(.*):ms; debug(0, "Error $errcode ($1) pinging $tb_url"); return 0; } return 1; } debug(0, "Malformed response while pinging $tb_url"); return 1; } sub make_excerpt { my ($story) = @_; # XXX options to use plaintext or foreshortened plugins $story =~ s:<.+?>::msg; $story = substr($story, 0, 255); return $story; } # plugin funcs sub start { return 0 unless $networking; return 0 if ($semi_auto && !param('autotrack')); # XXX there are at least two different race conditions here # 1: two instances running at the same time could both see an old # timestamp # 2: an instance run at the same time a new story is published could # (at least if there's any clock skew at all) create a timestamp # file >= timestamp(story), for a story that isn't seen. $last_timestamp = -e $timestamp_file ? stat($timestamp_file)->mtime : ($start_from_now ? time : 0); my $fh = new FileHandle; if (!$fh->open("> $timestamp_file")) { debug(0, "Couldn't touch timestamp file $timestamp_file"); return 0; } $fh->close; debug(1, "autotrack enabled"); return 1; } sub filter { my ($pkg, $files_ref) = @_; $files = $files_ref; return 1; } sub story { my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_; my ($pathname) = "$blosxom::datadir$path/$filename.$blosxom::file_extension"; return 1 if ($files->{$pathname} < $last_timestamp); my (%pinged, $ping_tries, $ping_succeed); my $excerpt = make_excerpt($$body_ref); my $url = "$blosxom::url$path/$filename.writeback"; defined $meta::tb_ping and ++$ping_tries and ping_trackback($meta::tb_ping, $$title_ref, $excerpt, $url) and ++$ping_succeed and ++$pinged{$meta::tb_ping}; return 1 if (defined $meta::autotrack && $meta::autotrack eq 'no'); while ($$body_ref =~ m!]* href=(?: (http://[^ ]+) | "(http://[^\"]+)" )!msxg) { my $trackback_url = get_trackback_url(decode_entities($+)); next unless defined $trackback_url; next if $pinged{$trackback_url}; $ping_tries++; ping_trackback($trackback_url, $$title_ref, $excerpt, $url) and ++$ping_succeed and ++$pinged{trackback_url}; debug(1, "autotracked: $trackback_url"); } # XXX what do we do if some succeed and some fail? # If we tried some but they all failed, revert the timestamp to # try again later if ($ping_succeed == 0 && $ping_tries > 0) { debug(0, "All pings failed, reverting timestamp"); utime($last_timestamp, $last_timestamp, $timestamp_file); } 1; } 1; =head1 NAME Blosxom Plug-in: autotrack =head1 SYNOPSIS Automatically or semi-automatically sends trackback pings for new stories. =head1 VERSION 0+2i 2nd test release =head1 AUTHOR Todd Larason http://molelog.molehill.org/ This plugin is now maintained by the Blosxom Sourceforge Team, . =head1 BUGS None known; please send bug reports and feedback to the Blosxom development mailing list . =head1 Trackback Ping URL Discovery Trackback Ping URLs are discovered two different ways. =head2 Manual Ping URLs If you have the meta plugin installed, and have it set to run prior to the autotrack plugin, you can give a trackback url with the "meta-tb_ping" header; the value of the header should be the ping URL to ping. =head2 Automatic Ping URL detection Subject to some exceptions explained below, every URL given in an 'href' in the story is fetched, and the resulting content is searched for embedded RDF sections giving trackback URLs for the given URL. This is the preferred way for all tools to be given trackback URLs, as it requires no human intervention, but unfortunately not everyone which has a trackback server includes the appropriate RDF. Even more unfortunately, there's no easy way to know whether it's included or not, other than examining the source of the page. It's always safe to give a meta-tb_ping header; if you give one, and the same ping URL is found by autodiscovery, it's only pinged once. If you don't want autodiscovery to be used for a given story, you can set the meta header 'meta-autotrack' to 'no'. If "meta-autotrack: no" is given, the meta-tb_ping URL is still pinged if it's specified. =head1 Customization =head2 Configuration Variables C<$dont_tb_re> is a regular expression agains which URLs are matched; if it matches, the URL isn't fetched for autodiscovery; this is useful for classes of URLs that you link to frequently that you know don't include the autodiscovery RDF, or that you don't wish to be pinged. The default value matches Amazon and Google URLs, as well as references to the current weblog. C<$start_from_now> is a boolean that controls the behavior if the timestamp file doesn't exist; if it's true, then it's treated as if it does exist, with the current time -- no old articles are pinged. If it's false, then every story seen is treated as new. Defaults to true. C<$semi_auto> is a boolean controlling how automatic the pinging is. If it's false, then the plugin acts in fully automatic mode -- it's always enabled, and any new story is examined. If it's true, then the plugin acts in semi-automatic mode -- it's only enabled if the URL being browsed includes the paramater "autotrack" (ie, ends with "?autotrack=yes"). By default, this is true. C<$networking> controls which networking implementation to use. If set to "LWP", an implementation which uses the common LWP (libwww-for-perl) perl module set is used; if set to a string that includes the word 'curl', an implementation which uses the external 'curl' utility is used, and the value of $networking is used as the beginning of the command line (this can be used to specify a full path to curl or to pass additional arguments); if set to a string which includes the word 'wget', an implementation which uses the external 'wget' utility is used with $networking used at the beginning of the command line as with curl. The wget executable must be new enough to include the --post-data option; currently, that means a recent 1.9 beta. Defaults to "LWP". C<$debug_level> is an int from 0 to 5 controlling how much debugging output is logged; 0 logs only errors. Defaults to 1. =head2 CSS and Flavour Files There is no output, so no customization through these methods. =head1 Timestamp A timestamp file is kept as $plugin_state_dir/.autotrack.timestamp; stories are considered 'new' if their timestamp is later than the timestamp file (see the C<$start_from_now> variable for the behavior if the file doesn't exist). There is a small race condition between reading the timestamp file and updating it when the plugin is enabled; one advantage of semi- automatic mode is that this is rarely a problem, since the plugin is only enabled when you want it to be. If trackback pings are attempted but they all fail, the timestamp file is reverted to its previous value, so the pings will be tried again later. if some pings succeed and others fail, however, the timestamp is left with the updated values, and the failed pings won't be retried. =head1 THANKS * Rael Dornfest -- blosxom (of course) and suggesting $start_from_now option * Taper Wickel -- pointing out wget 1.9's post support =head1 LICENSE this Blosxom Plug-in Copyright 2003, Todd Larason (This license is the same as Blosxom's) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. =cut