Some free code for parsing RSS documents
I'm working on another plugin for Movable Type, and I wrote some decent Perl code for parsing RSS 2.0 documents. Here it is, so anyone can use it if they need to do something similar.
This takes a date from an RSS feed and converts it into a format suitable for database storage:
# Convert an RFC 822 style date string, as used by RSS 2.0 feeds
# (e.g. "Mon, 15 Aug 2005 14:30:00 GMT"), into a zero-padded
# "YYYY-MM-DD HH:MM:SS" string suitable for database storage.
#
# Parameters:
#   $str - the date string taken from the feed.
#
# Returns the formatted string, or nothing (undef in scalar context) when
# the input cannot be parsed. The time zone field is ignored: the clock
# time is copied through exactly as it appears in the feed.
sub buildSqlDate
{
    my $str = shift;
    return unless defined $str;

    # Keyed by the lower-cased first three letters of the month name, so
    # "Sep", "Sept" and "September" are all accepted.
    my %monthNumberOf = (
        jan => 1, feb => 2,  mar => 3,  apr => 4,
        may => 5, jun => 6,  jul => 7,  aug => 8,
        sep => 9, oct => 10, nov => 11, dec => 12,
    );

    # split ' ' ignores leading whitespace and collapses runs of spaces,
    # which a plain / / split would turn into empty fields.
    my @pieces = split ' ', $str;

    # The leading day-of-week ("Mon,") is optional; drop it when present.
    shift @pieces if @pieces && $pieces[0] =~ /\A[A-Za-z]+,?\z/;

    # Need at least day, month, year and time; the zone may be absent.
    return unless @pieces >= 4;
    my ($day, $monthName, $year, $clock) = @pieces;

    return unless $day =~ /\A\d{1,2}\z/;

    my $month = $monthNumberOf{ lc substr($monthName, 0, 3) };
    return unless defined $month;

    return unless $year =~ /\A\d{2,4}\z/;
    # Obsolete short years, interpreted the way RFC 2822 prescribes.
    if (length($year) == 2)
    {
        $year += ($year < 50) ? 2000 : 1900;
    }
    elsif (length($year) == 3)
    {
        $year += 1900;
    }

    # Seconds are optional in the date grammar; treat them as 0 if missing.
    my ($hour, $min, $sec) = $clock =~ /\A(\d{1,2}):(\d{2})(?::(\d{2}))?\z/
        or return;
    $sec = 0 unless defined $sec;

    return sprintf('%04d-%02d-%02d %02d:%02d:%02d',
        $year, $month, $day, $hour, $min, $sec);
}
This will process an RSS 2.0 document:
# Private helper: return the text of the first <$tag> element found under
# $node, or nothing (undef in scalar context) when the element is missing
# or has no child node. Many RSS 2.0 elements are optional, so a missing
# one must not abort the whole parse.
sub _rssText
{
    my ($node, $tag) = @_;

    my ($element) = $node->getElementsByTagName($tag);
    return unless defined $element;

    my $child = $element->getFirstChild();
    return unless defined $child;

    return $child->getData();
}

# Extract the channel metadata and items from a parsed RSS 2.0 document.
#
# Parameters:
#   $doc           - DOM document object; it must support
#                    getElementsByTagName(), and the nodes it returns must
#                    support getFirstChild() / getData().
#   $channelNumber - zero-based index of the <channel> to read
#                    (defaults to 0 when omitted).
#
# Returns a hash reference with the keys copyright, description, docs,
# generator, language, lastBuildDate, link and title (each the element's
# text, or undef when the element is absent) and "entries": an array
# reference holding one hash reference per <item>, with the keys title,
# description, link, guid, pubDate (text or undef) and "categories" (an
# array reference of the item's <category> nodes).
#
# Dies when the document has no <channel>, or when $channelNumber does not
# select an existing one.
#
# NOTE(review): the lookups use whatever search depth the DOM library's
# getElementsByTagName() defaults to. If that search is recursive, a
# channel that lacks its own <title>/<link>/<description> would pick up
# the first one belonging to an <item> or <image> instead - confirm
# against the DOM library in use.
sub processRSS20
{
    my $doc = shift;
    my $channelNumber = shift;
    $channelNumber = 0 unless defined $channelNumber;

    my @channels = $doc->getElementsByTagName("channel");
    # An array in boolean context is true only when non-empty;
    # length(@channels) would measure the digits of the count instead.
    unless (@channels) { die("No channels found in the feed, stopping"); }

    my $channel = $channels[$channelNumber];
    unless (defined $channel)
    {
        die("Channel $channelNumber not found in the feed, stopping");
    }

    my @entries;
    foreach my $entry ($channel->getElementsByTagName("item"))
    {
        my %data;
        foreach my $field (qw(title description link guid pubDate))
        {
            $data{$field} = _rssText($entry, $field);
        }

        # Categories are handed back as nodes, not as text.
        my @categories = $entry->getElementsByTagName("category");
        $data{categories} = \@categories;

        # %data is a fresh lexical on every iteration, so each pushed
        # reference points at its own hash.
        push(@entries, \%data);
    }

    my %retVal;
    foreach my $field (qw(copyright description docs generator
                          language lastBuildDate link title))
    {
        $retVal{$field} = _rssText($channel, $field);
    }
    $retVal{entries} = \@entries;

    return \%retVal;
}
I've tested it locally on a copy of my blog's RSS feed. It should work fine for everyone else for basic parsing.

