# Revision 83 (by dpavlin, 2007/01/17 22:30:14)
# Restructure IMDB trivia parser; add db target to create
# trivia database in PostgreSQL.
use strict;
# File-level parser state, shared by qv(), qv_print() and parse_trivia().
# True until the "=====" separator line has been read; every line before it is skipped.
my $header = 1;
# Title and year taken from the most recent "# ..." line; year is undef when none was recognised.
my ($title, $year) = ('',undef);
# 1 while a trivia item is being collected, 0 otherwise.
my $in = 0;
# Text of the trivia item being collected, one "\n"-terminated line per input line.
my $trivia = '';
# "(qv)" references captured since the last item was handed to the callback.
my @qv;
# Occurrence counters (hash refs, created on first increment) keyed by year, title and "(qv)" reference.
my ($all_years,$all_titles,$all_quotes);
# qv_print($name)
#
# Record a single "(qv)" cross-reference: append it to @qv, bump its counter
# in $all_quotes and echo it to STDOUT as "+ name ".  Always returns the
# empty string, which qv() uses as the replacement text.  A false argument
# is ignored.
sub qv_print {
    my ($ref_name) = @_;
    return '' unless $ref_name;
    # disabled: would rewrite "Last, First" as "First Last"
    # $ref_name =~ s/(.*)\s*,\s*(.+)/$2 $1/g;
    push @qv, $ref_name;
    $all_quotes->{$ref_name}++;
    print "+ $ref_name ";
    return '';
}

# qv($text)
#
# Scan $text for cross-references of the form <quote>name<quote> (qv), where
# the quote character is one of _ ' " and the name contains none of those
# characters, and pass each name to qv_print().  Only a private copy of the
# text is modified; the caller's string is left alone.  Does nothing for a
# false argument.
sub qv {
    my ($text) = @_;
    return unless $text;
    $text =~ s#([_'"])([^_'"]+?)\1 \(qv\)#qv_print($2)#ge;
}
# fix_title($title)
#
# Return a copy of $title with a trailing ", The" (and any whitespace after
# it) removed, e.g. "Matrix, The" becomes "Matrix".  Titles without that
# suffix are returned unchanged.
sub fix_title {
    my ($title) = @_;
    $title =~ s/,\s+The\s*$//;
    return $title;
}
# Running count of trivia items handed to the parse_trivia() callback; printed as "[n] ".
my $i = 0;
# parse_trivia($fh, $callback)
#
# Read a trivia list from the open filehandle $fh and invoke $callback once
# per trivia item.
#
# Input layout, as recognised by the patterns below:
#   * every line up to and including the first one starting with "====="
#     is skipped as header;
#   * "# Title (year)", optionally followed by "(word)" suffixes, starts a
#     new title;
#   * "- text" starts a trivia item, lines beginning with two whitespace
#     characters continue it, and an empty line ends it;
#   * any other line is echoed to STDOUT prefixed with "#".
#
# The callback receives a flat key/value list:
#   title  => current title; when a year was recognised, surrounding double
#             quotes are dropped and the title is passed through fix_title()
#   year   => year from the title line, or undef when none was recognised
#   trivia => text of the item, one "\n"-terminated line per input line,
#             trailing whitespace trimmed
#   qv     => array ref of the "(qv)" references collected by qv()
#
# An item is also handed over when a new item, a new title or end of input
# is reached before the terminating empty line, so nothing is lost or
# attributed to the wrong title.
#
# Progress output ("# title [year]", "[n] ", "+ ref ") goes to STDOUT.
# All parser state lives in file-level variables, so $header is NOT reset
# here.  NOTE(review): a second call on another file would not skip that
# file's header -- confirm against the caller.
sub parse_trivia {
    my ($t, $call) = @_;

    # Hand the item collected so far (if any) to the callback, then reset
    # the per-item state.
    my $flush_item = sub {
        return unless $in;
        $i++;
        print "[$i] ";
        $call->(
            title  => $title,
            year   => $year,
            trivia => $trivia,
            qv     => [ @qv ],
        );
        $trivia = '';
        @qv     = ();
        $in     = 0;
        return;
    };

    # A lexical line variable keeps the caller's $_ intact.
    while (defined(my $line = <$t>)) {
        if ($header) {
            # still inside the file header; it ends at the "=====" line
            $header = 0 if $line =~ /^=====+/;
            next;
        }

        # (.*?) is non-greedy so that the following \s* really trims
        # trailing blanks and CRs instead of leaving them in the capture.
        if ($line =~ /^#\s+(.*?)\s*$/) {
            my $heading = $1;
            # an unterminated item still belongs to the previous title
            $flush_item->();
            $title = $heading;
            if ($title =~ m#^("*)(.*)\1\s*\((\d+)\)(?:\s*\(\w+\))*$#) {
                ($title, $year) = (fix_title($2), $3);
                $all_titles->{$title}++;
                $all_years->{$year}++;
            } else {
                $year = undef;
            }
            print "# $title ", ( $year ? "[$year]" : "" ), "\n";
        } elsif ($line =~ /^-\s(.*?)\s*$/) {
            my $text = $1;
            # previous item was not closed by an empty line
            $flush_item->();
            $in     = 1;
            $trivia = "$text\n";
            qv($text);
        } elsif ($line =~ /^\s\s(.*?)\s*$/) {
            my $text = $1;
            $trivia .= "$text\n";
            qv($text);
        } elsif ($line =~ /^$/ && $in) {
            $flush_item->();
        } else {
            # unrecognised line: echo it for inspection
            print "#$line\n";
        }
        # last if ($i > 1000); # XXX remove this!
    }

    # the input may end without a trailing empty line
    $flush_item->();
    return;
}
# dump_data($name, $hash)
#
# Write the hash referenced by $hash to the file $name (created or
# truncated), one "key<TAB>value" line per entry, sorted by key.  An
# undefined $hash produces an empty file.  Dies when the file cannot be
# opened or when closing it fails; returns 1 on success.
#
# The ($$) prototype is kept only so existing call sites parse as before.
sub dump_data($$) {
    my ($name, $hash) = @_;
    # a counter hash that was never incremented is still undef
    $hash ||= {};
    # three-argument open: $name is always taken as a literal file name
    # (no whitespace stripping, no mode characters, no "-" for STDOUT)
    open(my $fh, '>', $name) or die "can't open $name: $!";
    foreach my $k (sort keys %{$hash}) {
        print {$fh} "$k\t", $hash->{$k}, "\n";
    }
    # buffered write errors only surface when the handle is closed
    close($fh) or die "can't close $name: $!";
    return 1;
}
#dump_data('titles.data', $all_titles);
#dump_data('quotes.data', $all_quotes);
#dump_data('years.data', $all_years);
1;