#!/usr/bin/perl -w
# This script takes the URLs of several RSS feeds and creates two things:
# 1) A single RSS 1.0 feed combining all (or some of) the items.
# 2) An HTML file of all (or some of) the items.
# v1.0 2004-12-31
# Phil Gyford, phil@gyford.com
use strict;
use LWP::Simple;
use XML::RSS;
use Date::Manip qw(ParseDate UnixDate);
use POSIX qw(ceil);
######################################################
# CONFIG

# Source feeds, keyed by a short source name.
# Every key here needs a matching get_<key>() sub, and that sub has to be
# called before the RSS and HTML files are generated. See the existing
# examples.
my %FEEDS = (
    flickr  => 'http://www.flickr.com/services/feeds/photos_public.gne?id=35034346050@N01&format=rss_200',
    links   => 'http://www.gyford.com/phil/links/syndication/index.rdf',
    notes   => 'http://www.gyford.com/phil/notes/syndication/index.rdf',
    writing => 'http://www.gyford.com/phil/writing/syndication/index.rdf',
);

# Combined RSS feed: the filesystem path to write it to, and how many of
# the most recent items it should contain.
my $OUTPUT_FEED    = '/path/to/gyford.com/docs/phil/syndication/index.rdf';
my $ITEMS_FOR_FEED = 20;

# HTML file: the filesystem path to write it to, and how many of the most
# recent items it should contain.
my $OUTPUT_HTML    = '/path/to/gyford.com/docs/includefiles/front_page.html';
my $ITEMS_FOR_HTML = 20;

# Flickr settings.
# At most this many photos (the most recent ones) are shown in a single
# day's entry. If the day has more, a link to Flickr is added saying how
# many more photos remain.
my $MAX_PHOTOS_PER_DAY = 4;
# Your Flickr screen name.
my $FLICKR_SCREEN_NAME = 'philgyford';

# Local cache.
# Each successfully fetched feed is saved in this directory, and the saved
# copy is used whenever a feed can't be fetched.
my $CACHE_DIR = '/path/to/a/cache/dir';
# Set to 'true' to read only the cached copies of the feeds. Handy for
# testing, but the feeds must have been fetched successfully first!
my $USE_LOCAL = 'false';

# End config
######################################################
# %ITEMS collects every item from every source in one hash of hashes.
#
# Keys are '<unixtime>_<source>', where <source> is the matching key from
# %FEEDS ('flickr', 'links', etc.), eg: '1102167975_flickr'.
# The source is part of the key because two items from different sources
# could share the same unixtime.
#
# Each value is a hash with these keys:
#   title            - Required
#   link             - Required
#   description      - Required
#   html_description - Optional - used instead of 'description' in
#                      the HTML file.
#   time             - Required
#   content:encoded  - Optional - used in the RSS if present.
my %ITEMS;

# Fill %ITEMS from each source in turn...
foreach my $fetch_items (\&get_flickr, \&get_links, \&get_notes, \&get_writing) {
    $fetch_items->();
}

# ...then write both output files.
create_rss();
create_html();

exit;
sub get_flickr {
# Gets the items from the Flickr feed, groups them by day
# and creates a new entry in the global %ITEMS for each day, keyed
# '<unixtime>_flickr'.
# By FAR the most complex feed because we need to do the daily grouping.
#
# Takes no arguments. Reads $MAX_PHOTOS_PER_DAY and $FLICKR_SCREEN_NAME,
# and fetches the feed text through get_xml('flickr').
#
# A day's entry is only written when the first photo of a different day is
# reached, so photos of the same day are assumed to be adjacent in the
# feed - TODO confirm the feed is always sorted by date.
#
# NOTE(review): nothing flushes @todays_photos after the loop ends, so the
# photos of the last day in the feed never reach %ITEMS. Possibly deliberate
# (that day may be only partly present in the feed) - confirm.
# NOTE(review): the 'time' value combines $prev_photodate's date with
# timezone($photodate), i.e. the zone offset of the *current* photo rather
# than of the day being stored - confirm this is intended.
# NOTE(review): several string literals further down have unbalanced quotes
# or look empty, as if the HTML markup in them has been lost from this copy
# of the file. Check them against the upstream original before running.
my $xml;
$xml = get_xml('flickr');
# Parse the file.
my $rss = new XML::RSS;
$rss->parse($xml);
# We keep track of the previous photo's day so we know when
# we reach a new day.
my $prev_photodate; # Will be a Date::Manip object.
my $prev_photoday = ''; # Will be yyyy-mm-dd.
# We'll keep adding info about photos to this array until
# we reach a new day, at which point we'll create a new
# entry in the global %ITEMS hash for the old day, clear
# @todays_photos and start a new day.
# This is an array of hashes - each hash being about a photo.
my @todays_photos;
# Cycle through each photo in turn...
foreach my $item (@{$rss->{'items'}}) {
my $photodate = ParseDate($item->{'pubDate'});
my $photoday = UnixDate($photodate, "%Y-%m-%d");
# The check against '' stops the very first photo triggering a flush.
if ($photoday ne $prev_photoday && $prev_photoday ne '') {
# A new day, so add all the previous day's items into a new 'entry'
# in %ITEMS to store for posterity.
# Stored as the entry's 'description' (used for the RSS).
my $day_description;
# Used in the HTML version instead of $day_description.
my $html_description;
# How many of the day's photos have been written out so far.
my $num_photos_added_today = 0;
TODAYS_PHOTO: foreach my $todays_photo (@todays_photos) {
if ($num_photos_added_today == $MAX_PHOTOS_PER_DAY) {
# We don't want to add any more photos today.
# The number of photos for today we're not displaying.
my $remaining = scalar(@todays_photos) - $MAX_PHOTOS_PER_DAY;
my $plural = $remaining == 1 ? 'photo' : 'photos';
$day_description .= 'See ' . $remaining . " more $plural from this day
\n";
$html_description .= '\n";
last TODAYS_PHOTO;
}
# Extract the bits of the description we want.
my ($img_path, $width, $height) = ($todays_photo->{'description'} =~ m#src="(.*?)"\swidth="(\d+)"\sheight="(\d+)"#);
# The actual 'description' bit, not the title, the image tag but the
# paragraphs of description you can enter.
my ($img_description) = ($todays_photo->{'description'} =~ m#.*?
.*?
(.*?)
#s);
# Find the size of the thumbnail version.
# Flickr feed image (medium size) is max 240 width or height.
# Thumbnail is max 100.
# Flickr rounds UP when creating smaller size thumbs (hence ceil).
if ($width > $height) {
$width = 100;
$height = ceil ($height * (100/240));
} elsif ($height > $width) {
$height = 100;
$width = ceil ($width * (100/240));
} else {
$width = 100;
$height = 100;
}
# Use thumbnail image, rather than medium size.
$img_path =~ s/_m\.jpg/_t.jpg/;
$day_description .= '
' . "\n" . '' . $todays_photo->{'title'} . "";
$html_description .= '
\n";
if ($img_description) {
# We're not currently using the lengthy descriptions on the HTML page.
$day_description .= "
\n$img_description";
}
$day_description .= "
\n";
$num_photos_added_today++;
}
# Add all the previous day's photos to %ITEMS.
# The key is like '1102167975_flickr':
$ITEMS{ UnixDate($prev_photodate, "%s") . '_flickr' } = {
'title' => 'Photos for ' . UnixDate($prev_photodate, "%e %B %Y"),
'link' => 'http://www.flickr.com/photos/' . $FLICKR_SCREEN_NAME . '/archives/date-posted/' . UnixDate($prev_photodate, "%Y/%m/%d/detail/"),
'description' => $day_description,
'html_description' => $html_description,
'time' => UnixDate($prev_photodate, "%Y-%m-%dT") . '23:59:00' . timezone($photodate)
};
# Start afresh for a new day.
@todays_photos = ( );
}
push @todays_photos, {
'title' => $item->{'title'},
'link' => $item->{'link'},
'description' => $item->{'description'}
};
$prev_photodate = $photodate;
$prev_photoday = $photoday;
}
}
sub get_links {
    # Gets all the items from the links RSS and adds them to the global
    # %ITEMS, one entry per item, keyed like '1102167975_links'.
    #
    # Takes no arguments. Dies if the feed text can't be obtained or parsed.
    # Items whose dc:date is missing or can't be parsed are skipped with a
    # warning.

    # Parse the file.
    my $rss = XML::RSS->new;
    $rss->parse( get_xml('links') );

    ITEM: foreach my $item (@{ $rss->{'items'} }) {
        my $date  = $item->{'dc'}->{'date'};
        my $epoch = defined $date ? UnixDate($date, "%s") : undef;

        # Without a usable date the key would be just '_links', which sorts
        # ahead of every digit-led key in the descending sort and so would
        # be listed as the newest item.
        unless (defined $epoch && length $epoch) {
            warn "Skipping links item with no usable date: "
                . (defined $item->{'link'} ? $item->{'link'} : '(no link)') . "\n";
            next ITEM;
        }

        # The key is like '1102167975_links':
        $ITEMS{ $epoch . '_links' } = {
            'title'       => $item->{'title'},
            'link'        => $item->{'link'},
            'description' => $item->{'description'},
            'time'        => $date
        };
    }
    return;
}
sub get_notes {
    # Gets all the items from the notes RSS and adds them to the global
    # %ITEMS, one entry per item, keyed like '1102167975_notes'.
    # Titles are prefixed with 'Notes: '.
    #
    # Takes no arguments. Dies if the feed text can't be obtained or parsed.
    # Items whose dc:date is missing or can't be parsed are skipped with a
    # warning.

    # Parse the file. The content module has to be registered first so
    # that content:encoded is available on the parsed items.
    my $rss = XML::RSS->new;
    $rss->add_module(
        'prefix' => 'content',
        'uri'    => 'http://purl.org/rss/1.0/modules/content/'
    );
    $rss->parse( get_xml('notes') );

    ITEM: foreach my $item (@{ $rss->{'items'} }) {
        my $date  = $item->{'dc'}->{'date'};
        my $epoch = defined $date ? UnixDate($date, "%s") : undef;

        # Without a usable date the key would be just '_notes', which sorts
        # ahead of every digit-led key in the descending sort and so would
        # be listed as the newest item.
        unless (defined $epoch && length $epoch) {
            warn "Skipping notes item with no usable date: "
                . (defined $item->{'link'} ? $item->{'link'} : '(no link)') . "\n";
            next ITEM;
        }

        # The key is like '1102167975_notes':
        $ITEMS{ $epoch . '_notes' } = {
            'title'           => 'Notes: ' . $item->{'title'},
            'link'            => $item->{'link'},
            'description'     => $item->{'description'},
            'content:encoded' => $item->{'content'}->{'encoded'},
            'time'            => $date
        };
    }
    return;
}
sub get_writing {
    # Gets all the items from the writing RSS and adds them to the global
    # %ITEMS, one entry per item, keyed like '1102167975_writing'.
    #
    # Takes no arguments. Dies if the feed text can't be obtained or parsed.
    # Items whose dc:date is missing or can't be parsed are skipped with a
    # warning.

    # Parse the file. The content module has to be registered first so
    # that content:encoded is available on the parsed items.
    my $rss = XML::RSS->new;
    $rss->add_module(
        'prefix' => 'content',
        'uri'    => 'http://purl.org/rss/1.0/modules/content/'
    );
    $rss->parse( get_xml('writing') );

    ITEM: foreach my $item (@{ $rss->{'items'} }) {
        my $date  = $item->{'dc'}->{'date'};
        my $epoch = defined $date ? UnixDate($date, "%s") : undef;

        # Without a usable date the key would be just '_writing', which
        # sorts ahead of every digit-led key in the descending sort and so
        # would be listed as the newest item.
        unless (defined $epoch && length $epoch) {
            warn "Skipping writing item with no usable date: "
                . (defined $item->{'link'} ? $item->{'link'} : '(no link)') . "\n";
            next ITEM;
        }

        # The key is like '1102167975_writing':
        $ITEMS{ $epoch . '_writing' } = {
            'title'           => $item->{'title'},
            'link'            => $item->{'link'},
            'description'     => $item->{'description'},
            'content:encoded' => $item->{'content'}->{'encoded'},
            'time'            => $date
        };
    }
    return;
}
sub create_rss {
    # Once we've populated %ITEMS, create the RSS file.
    #
    # Builds an RSS 1.0 feed from the newest $ITEMS_FOR_FEED entries of the
    # global %ITEMS and saves it to $OUTPUT_FEED. Takes no arguments.
    my $rss = XML::RSS->new(version => '1.0');

    $rss->channel(
        'title'       => "Site Name",
        'link'        => "http://www.yourdomain.com/",
        'description' => "Description of your site",
        'dc'          => {
            'date'      => UnixDate('Now', "%Y-%m-%dT%H:%M:%S") . timezone('Now'),
            'creator'   => 'your@emailaddress',
            'publisher' => 'your@emailaddress',
            'language'  => 'en-gb' # Or 'en-us', etc.
        }
    );
    $rss->add_module(
        'prefix' => 'content',
        'uri'    => 'http://purl.org/rss/1.0/modules/content/'
    );
    $rss->image(
        'title' => 'Your name',
        'url'   => 'http://www.yourdomain.com/path/to/an/image.jpg',
        'link'  => 'http://www.yourdomain.com/'
    );

    # Keys start with the item's unixtime, so a reverse string sort puts
    # the newest items first.
    my $count = 0;
    ITEM: foreach my $timesource (sort { $b cmp $a } keys %ITEMS) {
        last ITEM if $count == $ITEMS_FOR_FEED;

        my $source = $ITEMS{$timesource};
        my %item = (
            'title'       => $source->{'title'},
            'link'        => $source->{'link'},
            'description' => $source->{'description'},
            'dc'          => {
                'creator' => 'Your Name',
                'date'    => $source->{'time'}
            }
        );
        if ($source->{'content:encoded'}) {
            # Pass the stored body through. It used to be replaced by an
            # empty string, so content:encoded was always emitted empty.
            # The text is handed over unescaped; XML::RSS is relied on to
            # escape it when the feed is written.
            $item{'content'}{'encoded'} = $source->{'content:encoded'};
        }
        $rss->add_item(%item);
        $count++;
    }

    $rss->save($OUTPUT_FEED);
}
sub create_html {
    # Once we've populated %ITEMS, create the HTML file.
    #
    # Writes the newest $ITEMS_FOR_HTML entries of the global %ITEMS to
    # $OUTPUT_HTML. Takes no arguments. Dies if the file can't be opened,
    # written or closed.
    #
    # The output file is only opened once all the HTML has been built, so
    # a failure while building no longer leaves a truncated, empty file.
    #
    # NOTE(review): the string literals below contain no HTML tags and
    # $ITEMS{...}{'link'} is never used, which suggests the markup has been
    # lost from this copy of the file. The literals are left exactly as
    # they were - check them against the upstream original.

    # This will contain the html we output.
    my $html = '';
    # These are so we can keep track of which day we're on.
    # When we hit an item on a new day, we can print a date header.
    my $prev_itemdate; # Will be a Date::Manip object.
    my $prev_itemday = ''; # Will be yyyy-mm-dd.
    my $count = 0;
    ITEM: foreach my $timesource (sort {$b cmp $a} keys %ITEMS) {
        if ($count == $ITEMS_FOR_HTML) {
            # We've added enough.
            last ITEM;
        }
        my $itemdate = ParseDate($ITEMS{$timesource}{'time'});
        my $itemday = UnixDate($itemdate, "%Y-%m-%d");
        # $item_type will be like 'writing', 'flickr', 'links', etc.
        # So we can do different things for each type if needed.
        my ($unixtime, $item_type) = split /_/, $timesource;
        # Not currently used - we could output a date header here.
        # if ($itemday ne $prev_itemday) {
        # # A new day, so create a new date header.
        # if ($prev_itemday ne '') {
        # $html .= "\t\t\n\t\n";
        # }
        #
        # $html .= "\t" . UnixDate($itemdate, "%e %B %Y") . "\n\t\n\t\t\n";
        # }
        $html .= "\t\t- ";
        $html .= '' . $ITEMS{$timesource}{'title'} . ' Permalink';
        $html .= "
\n\t\t- ";
        if ($ITEMS{$timesource}{'html_description'}) {
            # There's a special HTML version of the description.
            $html .= $ITEMS{$timesource}{'html_description'};
        } else {
            $html .= $ITEMS{$timesource}{'description'};
        }
        $html .= "\t\t
\n";
        $prev_itemdate = $itemdate;
        $prev_itemday = $itemday;
        $count++;
    }
    $html = "\t\n$html
\n";

    # Three-arg open on a lexical handle, so the mode can't be affected by
    # whatever characters the path contains.
    open (my $html_fh, '>', $OUTPUT_HTML) or die "Can't open $OUTPUT_HTML: $!";
    print {$html_fh} $html or die "Can't write to $OUTPUT_HTML: $!";
    # Buffered write errors only show up at close, so check that too.
    close ($html_fh) or die "Can't close $OUTPUT_HTML: $!";
    return;
}
####################################################
# Functions used by the main functions above.
sub get_xml {
    # Fetches the XML file identified by $FEEDS{$type}.
    # May get the local version if we're using local, or can't
    # get the remote one.
    #
    # $type - a key of %FEEDS; dies (via check_valid_type) if it isn't one.
    # Returns the feed text.
    my ($type) = @_;
    check_valid_type($type, 'get_xml');

    # Testing mode: never touch the network.
    return get_local_feed($type) if $USE_LOCAL eq 'true';

    my $feed_url = $FEEDS{$type};
    my $xml = get($feed_url);

    # An empty or whitespace-only response is treated as a failed fetch.
    # Saving it would overwrite the last good cached copy with nothing.
    if (defined $xml && $xml =~ /\S/) {
        write_local_feed($type, $xml);
        return $xml;
    }

    print "Couldn't get $feed_url - using local cache.\n";
    return get_local_feed($type);
}
sub write_local_feed {
    # When we've got the remote feed we'll write a copy locally.
    #
    # $type - a key of %FEEDS; dies (via check_valid_type) if it isn't one.
    # $text - the feed text to store in "$CACHE_DIR/$type.txt".
    # Dies if the cache file can't be opened, written or closed.
    my ($type, $text) = @_;
    check_valid_type($type, 'write_local_feed');

    my $cache_file = "$CACHE_DIR/$type.txt";

    # Three-arg open on a lexical handle, so the mode can't be affected by
    # whatever characters the path contains.
    open (my $feed_fh, '>', $cache_file)
        or die "Couldn't open $cache_file for writing: $!\n";
    print {$feed_fh} $text
        or die "Couldn't write to $cache_file: $!\n";
    # Buffered write errors only show up at close, so check that too.
    close ($feed_fh)
        or die "Couldn't close $cache_file: $!\n";
    return;
}
sub get_local_feed {
    # Fetches the contents of the cached copy of a feed.
    #
    # $type - a key of %FEEDS; dies (via check_valid_type) if it isn't one.
    # Returns the whole of "$CACHE_DIR/$type.txt" as one string.
    # Dies if the cache file can't be opened.
    my ($type) = @_;
    check_valid_type($type, 'get_local_feed');

    my $cache_file = "$CACHE_DIR/$type.txt";
    open (my $feed_fh, '<', $cache_file)
        or die "Couldn't open $cache_file for reading: $!\n";

    # Read whole contents into a string. $/ is localised so that slurp mode
    # ends with this block instead of staying switched on for every later
    # read in the program.
    my $text = do { local $/; <$feed_fh> };

    close ($feed_fh);
    return $text;
}
sub check_valid_type {
    # Makes sure $type is one of the keys of %FEEDS.
    #
    # $type     - the feed type to check.
    # $function - name of the calling sub, used only in the error message.
    # Dies if $type is not a key of %FEEDS.
    my ($type, $function) = @_;

    return if exists $FEEDS{$type};

    die "'$type' is not a valid feed type in $function.\n";
}
sub timezone {
    # Returns the timezone offset of $time with a colon in it.
    #
    # $time - anything UnixDate() accepts as a date.
    #
    # UnixDate's %z gives offsets like '+0100' but we need '+01:00', so the
    # colon is put in by hand: after the first three characters, before the
    # next two.
    my ($time) = @_;

    (my $offset = UnixDate($time, "%z")) =~ s/(...)(..)/$1:$2/;

    return $offset;
}