# Revision 83 (by dpavlin, 2007/01/17 22:30:14): restructured IMDB trivia parser, added db target to create
# trivia database in PostgreSQL
use strict;

# File-level parser state shared by qv() and parse_trivia().

# true until parse_trivia() has seen the "=====" separator line; every line
# read while it is true is skipped
my $header = 1;
# title and year taken from the most recent "# ..." heading line; $year is
# undef when the heading does not end in a "(digits)" year
my ($title, $year) = ('',undef);
# set to 1 while a trivia entry (started by a "- " line) is being collected
my $in = 0;
# text of the entry being collected, one "\n"-terminated line per input line
my $trivia = '';
# "(qv)" cross references found in the entry being collected
my @qv;

# occurrence counters, used as hash references keyed by year, title and
# cross-reference text respectively
my ($all_years,$all_titles,$all_quotes);

# qv_print($name)
#
# Record one "(qv)" cross reference: append it to the per-entry @qv list,
# bump its counter in $all_quotes and echo it on STDOUT as "+ name ".
# Undefined or empty names are ignored.  Always returns the empty string.
#
# Kept as a top-level sub: a named sub nested inside another sub is still
# a package-level sub in Perl, so nesting it only obscured that.
sub qv_print {
	my $v = shift;
	# test definedness and length rather than truth, so "0" is not dropped
	return '' unless defined $v && length $v;
#		$v =~ s/(.*)\s*,\s*(.+)/$2 $1/g;
	push @qv, $v;
	$all_quotes->{$v}++;
	print "+ $v ";
	return '';
}

# qv($text)
#
# Find every cross reference of the form <delim>name<delim> (qv) in $text,
# where <delim> is one of _ ' " and pass each name to qv_print().
#
# Inside the name only the delimiter that opened the reference is
# special; the other two quote characters are ordinary text, so
# _"Seinfeld" (1990)_ (qv) yields "Seinfeld" (1990).  The opening
# delimiter may itself appear inside the name when it sits between two
# letters or digits, so 'Peter O'Toole (I)' (qv) yields Peter O'Toole (I).
#
# Returns the number of references found; returns nothing for undefined
# or empty input.
sub qv {
	my $t = shift;
	return unless defined $t && length $t;

	my $found = 0;
	while ($t =~ m{
		([_'"])                           # opening delimiter
		(
			(?:
				(?!\1) .                  # any character except that delimiter
			|
				(?<=[^\W_]) \1 (?=[^\W_]) # or the delimiter inside a word: O'Toole
			)+?
		)
		\1 [ ] \(qv\)                     # same delimiter, one space, literal (qv)
	}xg) {
		qv_print($2);
		$found++;
	}

	return $found;
}

# fix_title($title)
#
# Return a copy of $title with a trailing ", The" (and any whitespace
# after it) removed.  Other titles are returned unchanged.
sub fix_title {
	my ($name) = @_;

	# the article is dropped, not moved to the front of the title
	$name =~ s/,\s+The\s*$//;

	return $name;
}

# running count of trivia entries handed to the parse_trivia() callback;
# printed as "[n] " before each callback invocation
my $i = 0;

# parse_trivia($fh, $callback)
#
# Read an IMDB trivia list from the open filehandle $fh and invoke
# $callback once per trivia entry with the named arguments
#
#   title  => title from the most recent "# ..." heading
#   year   => year from that heading, or undef if it had none
#   trivia => entry text, one "\n"-terminated line per input line
#   qv     => array reference of "(qv)" cross references in the entry
#
# Input layout handled here:
#   - everything up to and including the first "=====" line is skipped
#   - "# Title (year)"  starts a new title
#   - "- text"          starts a trivia entry
#   - "  text"          continues the current entry
#   - an empty line     ends the current entry
# Any other line is echoed to STDOUT prefixed with "#".  Progress output
# ("# title [year]", "[n] ", "+ ref ") is also printed to STDOUT.
#
# An entry that is still open when the input ends is passed to the
# callback as well, so the last entry is not lost when the file has no
# trailing blank line.
#
# NOTE: the header flag and the entry counter are file-level variables
# and are not reset here, so a second call does not skip a header again.
sub parse_trivia {
	my ($t, $call) = @_;

	# Hand the collected entry to the callback and reset per-entry state.
	my $emit = sub {
		$i++;
		print "[$i] ";

		$call->(
			title => $title,
			year => $year,
			trivia => $trivia,
			qv => [ @qv ],
		);

		$trivia = '';
		@qv = ();
		$in = 0;
	};

	# a lexical line variable keeps the caller's global $_ intact
	while (defined(my $line = <$t>)) {
		if ($header && $line =~ /^=====+/) {
			$header = 0;
			next;
		}
		next if $header;

		if ($line =~ /^#\s+(.*)\s*$/) {
			$title = $1;
			# optional surrounding double quotes, a "(year)" and any number
			# of trailing "(word)" suffixes, grouped without capturing
			if ($title =~ m#^("*)(.*)\1\s*\((\d+)\)(?:\s*\(\w+\))*$#) {
				($title, $year) = (fix_title($2),$3);
				$all_titles->{$title}++;
				$all_years->{$year}++;
			} else {
				$year = undef;
			}

			print "# $title ", ( $year ? "[$year]" : "" ), "\n";
			next;

		} elsif ($line =~ /^-\s(.*)\s*$/) {
			$in = 1;
			$trivia = "$1\n";
			qv($1);
		} elsif ($line =~ /^\s\s(.*)\s*$/) {
			$trivia .= "$1\n";
			qv($1);
		} elsif ($line =~ /^$/ && $in) {
			$emit->();
		} else {
			print "#$line\n";
		}

		#	last if ($i > 1000);	# XXX remove this!
	}

	# flush an entry that was not terminated by a blank line before EOF
	$emit->() if $in;
}

# dump_data($name, $hash)
#
# Write the hash referenced by $hash to the file $name, one
# "key<TAB>value" line per entry, sorted by key (string order).  An
# existing file is overwritten.  An undefined $hash produces an empty
# file.
#
# Dies if the file cannot be opened or if closing it reports an error.
# The ($$) prototype is kept so existing call sites parse as before.
sub dump_data($$) {
	my ($name,$hash) = @_;

	# three-argument open: $name is always taken literally as a file name,
	# whatever whitespace or mode characters it contains
	open(my $fh, '>', $name) || die "can't open $name: $!";

	$hash ||= {};
	foreach my $k (sort keys %{$hash}) {
		print $fh "$k\t",$hash->{$k},"\n";
	}

	# buffered write errors only surface when the handle is closed
	close($fh) || die "can't close $name: $!";
}

#dump_data('titles.data', $all_titles);
#dump_data('quotes.data', $all_quotes);
#dump_data('years.data', $all_years);

1;