# Blosxom Plugin: autotrack -*- cperl -*-
# Author: Todd Larason (jtl@molehill.org)
# Version: 0+2i
# Blosxom Home/Docs/Licensing: http://www.raelity.org/blosxom
# AutoTrack plugin Home/Docs/Licensing:
# http://molelog.molehill.org/blox/Computers/Internet/Web/Blosxom/AutoTrack/
package autotrack;
# -------------- Configuration Variables --------------
# regular expression matching URLs not to trackback
# there's no reason to try google or amazon, and most people don't
# want to trackback their own stories
$dont_tb_re = qr!(?:http://(?:
(?: www\.google\.com ) |
(?: www\.amazon\.com )
)
) |
(?: $blosxom::url )!ix
unless defined $dont_tb_re;
# what to do if the timestamp file doesn't exist? if $start_from_now is
# set, autotrack future stories but not old ones
$start_from_now = 1 unless defined $start_from_now;
# how automatic? if $semi_auto is set, only send trackbacks if ?autotrack=yes
# otheriwse, fully automatic
$semi_auto = 1 unless defined $semi_auto;
# networking implementation to be used
# can be 'LWP', 'curl' or 'wget' (or a full pathname to a curl or wget
# executable)
# wget must be at least a 1.9 beta to support the --post-data option
$networking = "LWP" unless defined $networking;
$debug_level = 1
unless defined $debug_level;
# -----------------------------------------------------
# template-visible vars
use CGI qw/param/;
use HTML::Entities;
use File::stat;
use strict;
use vars qw/$dont_tb_re $start_from_now $semi_auto $networking $debug_level/;
my $dont_really_ping = 0;
my $package = "autotrack";
my $timestamp_file = "$blosxom::plugin_state_dir/.$package.timestamp";
my $last_timestamp;
my $files;
sub debug {
my ($level, @msg) = @_;
if ($debug_level >= $level) {
print STDERR "$package debug $level: @msg\n";
}
}
# utility funcs
sub url_escape {
local ($_) = @_;
s/([^a-zA-Z0-9])/sprintf("%%%02x",ord($1))/eg;
s/%20/+/g;
return $_;
}
sub GET {
my ($url) = @_;
if ($networking =~ m:curl:) {
return `$networking -m 30 -s $url`;
} elsif ($networking =~ m:wget:) {
return `$networking --quiet -O - $url`;
} elsif ($networking eq 'LWP') {
foreach (qw/LWP::UserAgent HTTP::Request::Common/) {
eval "require $_";
if ($@) {
debug(0, "Can't load $_, can't use LWP networking");
return undef;
}
}
my $ua = LWP::UserAgent->new;
my $res = $ua->request(HTTP::Request::Common::GET $url);
if (!$res->is_success) {
my $error = $res->status_line;
debug(0, "HTTP GET error: $error");
return undef;
}
return $res->content;
} else {
debug(0, "ERROR: invalid \$networking $networking");
}
}
sub POST {
my ($url, %vars) = @_;
if ($networking =~ m:curl:) {
my $command = "$networking -m 30 -s ";
$command .= join ' ',
map {my $v = url_escape($vars{$_}); "-d $_=$v";} keys %vars;
$command .= " $url";
debug(2, "Posting with :$command:");
return `$command`
unless $dont_really_ping;
return "0"; # for testing
} elsif ($networking =~ m:wget:) {
my $command = "$networking --quiet -O - --post-data='";
$command .= join '&',
map {my $v = url_escape($vars{$_}); "$_=$v"} keys %vars;
$command .= "' $url";
debug(2, "Posting with :$command:");
return `$command`
unless $dont_really_ping;
return "0"; # for testing
} elsif ($networking eq 'LWP') {
foreach (qw/LWP::UserAgent HTTP::Request::Common/) {
eval "require $_";
if ($@) {
debug(0, "Can't load $_, can't use LWP networking");
return undef;
}
}
my $ua = LWP::UserAgent->new;
my $res = $ua->request(HTTP::Request::Common::POST($url, [%vars]));
if (!$res->is_success) {
my $error = $res->status_line;
debug(0, "HTTP POST error: $error");
return undef;
}
return $res->content;
} else {
debug(0, "ERROR: invalid \$networking $networking");
}
}
sub get_trackback_url {
my ($url) = @_;
return undef if ($url =~ m:$dont_tb_re:);
my $text = GET($url);
return undef if (!defined($text));
while ($text =~ m!()!msg) {
my $rdf = $1;
my ($id) = ($rdf =~ m!dc:identifier="([^\"]+)"!);
next unless ($id eq $url);
my ($tb) = ($rdf =~ m!trackback:ping="([^\"]+)"!);
return $tb if defined $tb;
}
if ($url =~ m:(.*)\#:) {
$url = $1;
while ($text =~ m!()!msg) {
my $rdf = $1;
# XXX is this good enough? Can't the namespace IDs be different?
# the sample code in the spec @
# http://www.movabletype.org/docs/mttrackback.html
# does it this way
my ($id) = ($rdf =~ m!dc:identifier="([^\"]+)"!);
next unless ($id eq $url);
my ($tb) = ($rdf =~ m!trackback:ping="([^\"]+)"!);
return $tb if defined $tb;
}
}
debug(2, "Couldn't find tb url for $url");
return undef;
}
sub ping_trackback {
my ($tb_url, $title, $excerpt, $url) = @_;
my $txt = POST($tb_url, title => $title, url => $url, excerpt => $excerpt,
blog_name => $blosxom::blog_title);
debug(3, "Response:$txt:");
if ($txt =~ m:(.*?):ms) {
if ($1) {
my $errcode = $1;
$txt =~ m:(.*):ms;
debug(0, "Error $errcode ($1) pinging $tb_url");
return 0;
}
return 1;
}
debug(0, "Malformed response while pinging $tb_url");
return 1;
}
sub make_excerpt {
my ($story) = @_;
# XXX options to use plaintext or foreshortened plugins
$story =~ s:<.+?>::msg;
$story = substr($story, 0, 255);
return $story;
}
# plugin funcs
sub start {
return 0 unless $networking;
return 0 if ($semi_auto && !param('autotrack'));
# XXX there are at least two different race conditions here
# 1: two instances running at the same time could both see an old
# timestamp
# 2: an instance run at the same time a new story is published could
# (at least if there's any clock skew at all) create a timestamp
# file >= timestamp(story), for a story that isn't seen.
$last_timestamp = -e $timestamp_file ? stat($timestamp_file)->mtime :
($start_from_now ? time : 0);
my $fh = new FileHandle;
if (!$fh->open("> $timestamp_file")) {
debug(0, "Couldn't touch timestamp file $timestamp_file");
return 0;
}
$fh->close;
debug(1, "autotrack enabled");
return 1;
}
sub filter {
my ($pkg, $files_ref) = @_;
$files = $files_ref;
return 1;
}
sub story {
my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
my ($pathname) = "$blosxom::datadir$path/$filename.$blosxom::file_extension";
return 1 if ($files->{$pathname} < $last_timestamp);
my (%pinged, $ping_tries, $ping_succeed);
my $excerpt = make_excerpt($$body_ref);
my $url = "$blosxom::url$path/$filename.writeback";
defined $meta::tb_ping and
++$ping_tries and
ping_trackback($meta::tb_ping, $$title_ref, $excerpt, $url) and
++$ping_succeed and
++$pinged{$meta::tb_ping};
return 1 if (defined $meta::autotrack && $meta::autotrack eq 'no');
while ($$body_ref =~
m!]*
href=(?:
(http://[^ ]+) |
"(http://[^\"]+)"
)!msxg) {
my $trackback_url = get_trackback_url(decode_entities($+));
next unless defined $trackback_url;
next if $pinged{$trackback_url};
$ping_tries++;
ping_trackback($trackback_url, $$title_ref, $excerpt, $url) and
++$ping_succeed and
++$pinged{trackback_url};
debug(1, "autotracked: $trackback_url");
}
# XXX what do we do if some succeed and some fail?
# If we tried some but they all failed, revert the timestamp to
# try again later
if ($ping_succeed == 0 && $ping_tries > 0) {
debug(0, "All pings failed, reverting timestamp");
utime($last_timestamp, $last_timestamp, $timestamp_file);
}
1;
}
1;
=head1 NAME
Blosxom Plug-in: autotrack
=head1 SYNOPSIS
Automatically or semi-automatically sends trackback pings for new stories.
=head1 VERSION
0+2i
2nd test release
=head1 AUTHOR
Todd Larason http://molelog.molehill.org/
=head1 BUGS
None known; address bug reports and comments to me or to the Blosxom
mailing list [http://www.yahoogroups.com/groups.blosxom].
=head1 Trackback Ping URL Discovery
Trackback Ping URLs are discovered two different ways.
=head2 Manual Ping URLs
If you have the meta plugin installed, and have it set to run prior to the
autotrack plugin, you can give a trackback url with the "meta-tb_ping"
header; the value of the header should be the ping URL to ping.
=head2 Automatic Ping URL detection
Subject to some exceptions explained below, every URL given in an 'href' in
the story is fetched, and the resulting content is searched for embedded RDF
sections giving trackback URLs for the given URL. This is the preferred way
for all tools to be given trackback URLs, as it requires no human
intervention, but unfortunately not everyone which has a trackback server
includes the appropriate RDF. Even more unfortunately, there's no easy
way to know whether it's included or not, other than examining the source
of the page.
It's always safe to give a meta-tb_ping header; if you give one, and the
same ping URL is found by autodiscovery, it's only pinged once.
If you don't want autodiscovery to be used for a given story, you can set
the meta header 'meta-autotrack' to 'no'. If "meta-autotrack: no" is given,
the meta-tb_ping URL is still pinged if it's specified.
=head1 Customization
=head2 Configuration Variables
C<$dont_tb_re> is a regular expression agains which URLs are matched;
if it matches, the URL isn't fetched for autodiscovery; this is useful
for classes of URLs that you link to frequently that you know don't
include the autodiscovery RDF, or that you don't wish to be pinged. The
default value matches Amazon and Google URLs, as well as references to
the current weblog.
C<$start_from_now> is a boolean that controls the behavior if the timestamp
file doesn't exist; if it's true, then it's treated as if it does exist,
with the current time -- no old articles are pinged. If it's false, then
every story seen is treated as new. Defaults to true.
C<$semi_auto> is a boolean controlling how automatic the pinging is. If
it's false, then the plugin acts in fully automatic mode -- it's always
enabled, and any new story is examined. If it's true, then the plugin
acts in semi-automatic mode -- it's only enabled if the URL being browsed
includes the paramater "autotrack" (ie, ends with "?autotrack=yes"). By
default, this is true.
C<$networking> controls which networking implementation to use. If set to
"LWP", an implementation which uses the common LWP (libwww-for-perl) perl
module set is used; if set to a string that includes the word 'curl', an
implementation which uses the external 'curl' utility is used, and the value
of $networking is used as the beginning of the command line (this can be used
to specify a full path to curl or to pass additional arguments); if set
to a string which includes the word 'wget', an implementation which uses the
external 'wget' utility is used with $networking used at the beginning of
the command line as with curl. The wget executable must be new enough to
include the --post-data option; currently, that means a recent 1.9 beta.
Defaults to "LWP".
C<$debug_level> is an int from 0 to 5 controlling how much debugging output
is logged; 0 logs only errors. Defaults to 1.
=head2 CSS and Flavour Files
There is no output, so no customization through these methods.
=head1 Timestamp
A timestamp file is kept as $plugin_state_dir/.autotrack.timestamp; stories
are considered 'new' if their timestamp is later than the timestamp file
(see the C<$start_from_now> variable for the behavior if the file doesn't
exist). There is a small race condition between reading the timestamp
file and updating it when the plugin is enabled; one advantage of semi-
automatic mode is that this is rarely a problem, since the plugin is only
enabled when you want it to be.
If trackback pings are attempted but they all fail, the timestamp file is
reverted to its previous value, so the pings will be tried again later. if
some pings succeed and others fail, however, the timestamp is left with the
updated values, and the failed pings won't be retried.
=head1 THANKS
* Rael Dornfest -- blosxom (of course) and suggesting $start_from_now option
* Taper Wickel -- pointing out wget 1.9's post support
=head1 LICENSE
this Blosxom Plug-in
Copyright 2003, Todd Larason
(This license is the same as Blosxom's)
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
=cut