#!/usr/bin/perl -w require 5.8.8; use strict; use utf8; use Digest::MD5 qw(md5_base64); use Encode; use Getopt::Long qw(:config bundling); use HTML::Entities; use HTML::FormatText; use HTML::TreeBuilder; use HTML::TreeBuilder::XPath; use HTTP::Daemon; use HTTP::Status; use HTML::Tidy; use LWP::UserAgent; use POSIX qw(locale_h strftime); use XML::RSS; use constant { DEBUG => 0, INFO => 1, WARNING => 2, ERROR => 3, }; our $VERSION = '0.4'; my $NAME = 'F5er'; my $DEBUG = 1; my $TIME_FORMAT = '%y/%m/%d %H:%M:%S'; my $LOG_FORMAT = '<%s> %s'; # Specifies the maximum number of pages to go through while looking for # a last page. my $MAX_PAGE_REDIRECT = 50; my $Config = '/etc/f5er.conf'; my $Host = 'localhost'; my $Port = 8080; my $Help; # Subroutines ################################################################## sub l { my ($type, $message) = @_; my $format_message = sub { my $message = shift; my $time = strftime($TIME_FORMAT, localtime); return sprintf ($LOG_FORMAT, $time, $message); }; $message .= "\n" unless $message =~ /\n$/; if ($type eq DEBUG) { print STDOUT &$format_message($message) if ($DEBUG); } elsif ($type eq INFO) { print STDOUT &$format_message($message); } elsif ($type eq WARNING) { print STDERR &$format_message($message); } elsif ($type eq ERROR) { print STDERR &$format_message($message); exit 1; } else { die "Unknown message type."; } } sub print_usage { my $message = shift; if (defined $message && length $message) { $message .= "\n" unless $message =~ /\n$/; } my $prog_name = $0; $prog_name =~ s#^.*/##; print STDERR ( $message, "Usage: $prog_name [-a HOSTNAME] [-p PORT] [-c CONFIG]\n" . " -a, --address=HOSTNAME Local host bind address (the default " . "value is `$Host').\n" . " -p, --port=NUMBER Local host bind port (the default " . "value is '$Port').\n" . " -c, --config=FILE Configuration file to use (the default " . "value is `$Config').\n" . " -h, --help Print this help message and exits.\n" ); exit 1; } sub parse_config { my $cfg_file_name = shift; my $result; # Check if all mandatory options in the channel are set my $check_completeness = sub { my $id = shift; return unless ($id && $result->{$id}); foreach (qw(title description link selection_xpath)) { unless ($result->{$id}->{$_}) { l WARNING, "Malformed config file: " . "mandatory option `$_' is not specified."; delete $result->{$id}; last; } } }; open (TXT, '<', $cfg_file_name) or l ERROR, "Can not open the config file."; my $channel_id; while (my $line = ) { chop $line; if ($line =~ /^(#.*)?$/) { l DEBUG, "Skipping " . (($1) ? "comment" : "empty line") . '.'; next; } elsif ($line =~ /^\[(\w+)\]$/) { l DEBUG, "New channel found: `$1'."; &$check_completeness($channel_id); $channel_id = $1; $result->{$channel_id} = {}; } elsif ($line =~ /^(\w+)\s?=\s?(.*)$/) { l DEBUG, "New option found: `$1' = `$2'."; unless ($channel_id) { l WARNING, "Malformed config file: " . "option `$1' before channel declaration."; next; } $result->{$channel_id}->{$1} = $2; Encode::_utf8_on($result->{$channel_id}->{$1}); } else { l WARNING, "Malformed config file: " . "can't parse the following line: `$line'."; } } &$check_completeness($channel_id); close(TXT); return $result; } # Downloads whole html page from a given url sub download_html { my ($ua, $url, $form) = @_; my $html; l DEBUG, "Downloading `$url'..."; my $resp = (defined $form && $form) ? $ua->post($url, $form) : $ua->get($url); unless ($resp && $resp->is_success) { l WARNING, "Could not download `$url'."; return undef; } $html = $resp->content; my $from; if ($resp->headers()->{"content-type"} =~ /charset=([\w\-]+)/) { $from = lc $1; } elsif ($html =~ /charset=([\w\-]+)/) { $from = lc $1; } elsif ($html =~ /encoding="([\w\-]+)"/) { $from = lc $1; } else { l WARNING, "Unable to determine encoding of the HTML page."; } if ($from) { Encode::_utf8_off($html); Encode::from_to($html, $from, 'utf-8'); Encode::_utf8_on($html); } my $tidy = HTML::Tidy->new(); # Suppress STDERR manually since the `quiet' option is unsupported open OLDERR, ">&", STDERR; open STDERR, ">/dev/null"; $html = $tidy->clean($html); open STDERR, ">&", OLDERR; return $html; } sub extract_selection { my ($html, $xpath) = @_; my $selection; my $tree = HTML::TreeBuilder::XPath->new; $tree->parse($html); my @nodes; eval { @nodes = @{$tree->findnodes($xpath)}; }; $selection = $selection . $_->as_HTML('<>&"') foreach (@nodes); $tree->delete; return $selection; } sub get_last_ref { my ($ua, $url, $xpath) = @_; l DEBUG, "Fetching reference to the last page..."; my $max_number = 0; my $prev_max_number = -1; (my $domain = $url) =~ s/^(https?:\/\/[^\/]*\/).*/$1/; (my $parent_url = $url) =~ s/^(.*\/)[^\/]*/$1/; unless ($domain && $parent_url) { l WARNING, "Could not extract domain and/or parent URL address."; return undef; } my $ref = $url; my $i = $MAX_PAGE_REDIRECT; while ($max_number > $prev_max_number && $i > 0) { l DEBUG, "Handling page #$max_number ..."; $prev_max_number = $max_number; $i--; my $html = download_html($ua, $ref); return undef unless ($html); my $tree = HTML::TreeBuilder::XPath->new; $tree->parse($html); my $node; eval { $node = @{$tree->findnodes($xpath)}[0]; }; unless ($node) { l WARNING, "Could not find a paginator node on the HTML page."; $tree->delete; return undef; } my @refs = $node->findnodes('./descendant::a'); foreach my $reff (@refs) { if ($reff->as_text() =~ /(\d+)/ && $1 > $max_number) { $max_number = $1; $ref = $reff->attr('href'); unless ($ref =~ /^http/) { $ref = ($ref =~ /^\//) ? $domain . $ref : $parent_url . $ref; } } } $tree->delete; } return $ref; } # Initialization ############################################################### l DEBUG, "$NAME $VERSION started."; GetOptions('help|h' => \$Help, 'config|c=s' => \$Config, 'address|a=s' => \$Host, 'port|p=i' => \$Port) or print_usage("Invalid commmand line options."); print_usage if defined $Help; my $Channels = parse_config ($Config) or l ERROR, "No channels to monitor."; my $UserAgent = LWP::UserAgent->new; my $Formatter = HTML::FormatText->new; my $Daemon = HTTP::Daemon->new(LocalAddr => $Host, LocalPort => $Port, Reuse => 1) or l ERROR, "Can not start HTTP daemon at '$Host:$Port'."; binmode(STDOUT, ":utf8"); setlocale (LC_TIME, 'C'); # Main clycle ################################################################## while (my $c = $Daemon->accept) { my $r = $c->get_request; goto CLEANUP unless ($r); l DEBUG, "Got request: `" . $r->method . ' ' . $r->uri->path . "'."; if ($r->method ne 'GET') { l WARNING, "Forbidden method `" . $r->method . "'."; $c->send_error(RC_FORBIDDEN); goto CLEANUP; } $r->uri->path =~ /^\/(\w+)$/ or do { l WARNING, "Malformed URI `" . $r->uri->path . "'."; $c->send_error(RC_BAD_REQUEST); goto CLEANUP; }; my $id = $1; unless ($Channels->{$id}) { l WARNING, "Unknown channel `$id'."; $c->send_error(RC_NOT_FOUND); goto CLEANUP; } l DEBUG, "Handling channel `" . $Channels->{$id}->{title} . "'."; my $url = $Channels->{$id}->{link}; my $form; if ($Channels->{$id}->{post_form}) { my @pairs = split('&', $Channels->{$id}->{post_form}); foreach my $pair (@pairs) { $pair =~ /^(.*)=(.*)?$/ or do { l WARNING, "Malformed form field: `$pair'."; next; }; $form->{$1} = $2; } } # Go to the last page elsif ($Channels->{$id}->{paginator_xpath}) { $url = get_last_ref($UserAgent, $url, $Channels->{$id}->{paginator_xpath}); unless ($url) { l WARNING, "Could not obtain a reference to the last page."; $c->send_error(RC_INTERNAL_SERVER_ERROR); goto CLEANUP; } } my $html = download_html($UserAgent, $url, $form); unless ($html) { $c->send_error(RC_INTERNAL_SERVER_ERROR); goto CLEANUP; } my $selection = extract_selection($html, $Channels->{$id}->{selection_xpath}); unless ($selection) { l WARNING, "Looks like the markup was changed and the XPath `" . $Channels->{$id}->{selection_xpath} . "' is obsolete."; $c->send_error(RC_INTERNAL_SERVER_ERROR); goto CLEANUP; } my $date = strftime("%a, %d %b %Y %H:%M:%S %z", localtime); my $tree = HTML::TreeBuilder->new->parse($selection); my $summary = substr $Formatter->format($tree), 0, 50; $tree->delete; # Remove leading spaces and newlines $summary =~ s/^\s+//; $summary =~ s/\n//g; my $rss = XML::RSS->new(version => '2.0', encode_output => 0); my @params = (title => $Channels->{$id}->{title}, link => $Channels->{$id}->{link}, description => $Channels->{$id}->{description}, pubDate => $date, generator => "$NAME $VERSION" ); push @params, ttl => $Channels->{$id}->{ttl} if ($Channels->{$id}->{ttl}); $rss->channel(@params); $rss->add_item(title => "Update found: $summary...", link => $url, description => encode_entities($selection, '<>&"'), guid => md5_base64(Encode::encode_utf8($selection))); my $res = HTTP::Response->new(200); $res->header('Content-Type' => 'text/xml'); $res->content(Encode::encode_utf8($rss->as_string)); $c->send_response($res); CLEANUP: $c->close; l DEBUG, "Connection closed."; undef($c); } =pod =head1 NAME F5er - RSS notifier about updates on web pages =head1 SYNOPSIS f5er [B<--address>=I] [B<--port>=I] [B<--config>=F] f5er B<--help> =head1 DESCRIPTION B is a daemon, which allows user to receive notifications about changes in web pages, which do not offer a built-in RSS support. On a user's request, it downloads a predefined web page, extracts target information from the web page content and generates an RSS feed on the basis of the content. The actual determination whether the web page was changed or not is performed by the user's RSS aggregator. So F5er is a universal intermediary between any web site and any RSS aggregator. In order to receive notifications, user needs to add a new feed to his RSS aggregator for each F5er channel defined in the configuration file. F5er's feeds have the following URL format: http://hostname:port/channel_id Where I is a channel ID specified in the channel declaration. See the configuration section for details. =head1 OPTIONS =over 4 =item B<-a> I, B<--address>=I Local host bind address. The default value is C. =item B<-p> I, B<--port>=I Local host bind port. The default value is C<8080>. =item B<-c> F, B<--config>=F Configuration file to use. The default value is F. =item B<-h>, B<--help> Print out usage information. =back =head1 CONFIGURATION Each line of the config file is either a comment or a directive. Comment lines start with '#' and are ignored as well as empty lines. Directives are of two types: channel declarations and parameter assignments. A channel declaration has the following format: [channel_id] The only allowed characters in a channel ID are alphanumeric characters and underscore. All parameters following the channel declaration up to the next declaration or up to the end of file are related to the same channel_id. A parameter assignment contains a parameter name and a parameter value separated by '='. For example: parameter = value The supported parameters are listed below. =over 4 =item B Required. Defines the title of the channel. =item B<description> Required. Describes the channel. =item B<link> Required. Defines the web site URL of the channel. =item B<post_form> Optional. HTML form to send when fetching HTML page of the channel. If set, F5er will send a POST request instead of a GET, which is the default method. =item B<selection_xpath> Required. XPath to the selection to extract. See the L<XPath Tutorial|/"NOTES"> for information on how to compose XPath. =item B<paginator_xpath> Optional. XPath to the paginator on the web page. If set, F5er will try to nagivate to the last page available. In order to be properly handled, paginator should contain numeric links to pages. =item B<ttl> Optional. How often to refresh the feed from the source (in minutes). =back For example the following configuration may be used for monitoring current Perl version: [perl] title = Perl description = Current Perl version link = http://www.perl.org/ selection_xpath = //div[@id='short_lists']/div[1] ttl = 60 =head1 DEPENDENCIES Besides the Perl interpreter and the core modules, F5er requires the following modules: L<HTML::Entities>, L<HTML::FormatText>, L<HTML::TreeBuilder>, L<HTML::TreeBuilder::XPath>, L<HTTP::Daemon>, L<HTTP::Status>, L<HTTP::Tidy>, L<LWP::UserAgent>, L<XML::RSS>. All of them are available on CPAN. =head1 NOTES =over 4 =item 1. L<Comparison of feed aggregators|http://en.wikipedia.org/wiki/Comparison_of_feed_aggregators> =item 2. L<XPath Tutorial|http://www.w3schools.com/xpath/default.asp> =item 3. L<RSS Reference|http://www.w3schools.com/rss/rss_reference.asp> =back =head1 HISTORY =over 4 =item S<15 Nov 2011> - Version 0.4 Fixed memory leak. =item S<24 Oct 2011> - Version 0.3 Added ability to compose channels based on POST requests (using the `post_form' parameter). =item S<10 May 2011> - Version 0.2 Added cleaning up of the HTML content in order to extract nodes from pages with invalid markup. =item S<18 Apr 2011> - Version 0.1 Initial public release. =back =head1 COPYRIGHT AND LICENSE Copyright (C) 2011 Vitaly Minko This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the L<GNU General Public License|http://www.gnu.org/licenses/> for more details. =head1 AUTHOR Vitaly Minko <vitaly.minko@gmail.com> =cut