Revision 27 (by dpavlin, 2007/12/15 13:52:40) create tags from parts of page name
#!/usr/bin/perl

use warnings;
use strict;

use XML::Simple;
use File::Find;
use Regexp::Common qw/balanced/;
use Socialtext::Resting;
use Encode;
use HTTP::Date;
use POSIX qw/strftime/;
use File::Slurp;
use File::MMagic::XS;
use Getopt::Long;
use Data::Dump qw/dump/;

# Runtime settings; each can be overridden from the command line.
my $debug = 0;        # verbosity, every --debug on the command line adds one
my $max = 999;        # upper bound on the number of pages that get imported
my $attachments = 0;  # true when attachments should be uploaded as well

# A tag from this list is added to every page whose full name contains
# the tag text (the import loop compares case-insensitively).
my @create_tags = (qw/
Trazi
Nudi
SvakodnevneDovitljivosti
G33koSkop
/);

# GetOptions returns false (after warning about each offending option)
# when the command line could not be parsed.  Stop in that case instead
# of silently running the import with default settings.
GetOptions(
	'debug+' => \$debug,
	'max=i' => \$max,
	'attachments' => \$attachments,
) or die "usage: $0 [--debug] [--max=N] [--attachments] [directory]\n";

# Filled in by the File::Find callback:
my $page;        # hashref keyed by full page name
my $page_date;   # not referenced by the code visible in this file
my @page_names;  # last path component of every page name

print "Importing $max pages", $attachments ? " with attachments" : "", "...\n";

# Walk the directory given as first command-line argument (or the current
# directory) and load every plain file found there as one page.  Each page
# is stored in $page under its full name; the last path component of the
# name is appended to @page_names.
find({
	no_chdir => 1,
	wanted => sub {
		my $path = $File::Find::name;
		-f $path or return;

		warn "+ $path\n";
		my $ref = XMLin( $path,
			KeyAttr => {
				'attachment' => '+name',
				'meta' => 'name',
			},
			ForceArray => [ 'attachment', 'meta', 'widget' ],
		) || die "can't open $path: $!";

		warn "## $path = ",dump( $ref ) if $debug;

		my $name = $ref->{name} || die "no name in $path";

		# pages whose name starts with TamSystem are not imported
		return if $name =~ m/^TamSystem/;

		my $date = $ref->{meta}->{LastModified}->{value};
		unless ( $date ) {
			warn "SKIP: no LastModified in $path $name";
			return;
		}

		# page text is the data of all widgets, separated by horizontal rules
		my $data;
		for my $widget ( @{ $ref->{widgets}->{widget} } ) {

			warn "## w = ",dump( $widget ) if $debug;

			my $chunk = $widget->{data} || die "no data?";
			$data .= "\n----\n" if $data;
			$data .= $chunk;
		}

		# attachment files live next to the page file, under attachments/
		# instead of pages/, with the attachment name as additional suffix
		my $files;

		if ( my $listed = $ref->{attachment} ) {
			for my $att_name ( keys %$listed ) {
				( my $full_path = $path ) =~ s,pages/,attachments/,;
				$full_path .= ".$att_name";
				-e $full_path or die "$full_path doesn't exist";
				push @$files, {
					full_path => $full_path,
					name => ( $att_name || $listed->{$att_name}->{desc} || 'noname' ),
				};
			}
		}

		$page->{ $name } = {
			content => convert_markup( $data ),
			original => $data,
			date => convert_date( $date ),
			attachments => $files,
		};

		# remember only the part after the last slash for auto-linking
		( my $short_name = $name ) =~ s,^.+/([^/]+)$,$1,;
		push @page_names, $short_name;

	},
}, shift @ARGV || '.');

# Full names of all loaded pages (not referenced by the code visible in this file).
my @pages = ( keys %$page );

warn "found following pages: ", join(", ", @page_names),"\n";

# Alternation of all short page names; the import loop uses it to turn
# plain mentions of a page into [links].  Every name goes through
# quotemeta so that regex metacharacters inside a page name are matched
# literally instead of corrupting the pattern.  The single capture group
# is kept because the substitution that uses this pattern reads $1.
my $page_link_re = '\b(' . join('|', map { quotemeta($_) } @page_names) . ')\b';

# REST client for the target workspace.
# NOTE(review): credentials are hard-coded in the source - consider moving them out.
my $Rester = Socialtext::Resting->new(
	username => 'tamtam',
	password => 'import',
	server   => 'http://saturn.ffzg.hr/',
	workspace => 'razmjenavjestina',
);
$Rester->put_workspacetag('TamTam');

# Format an epoch timestamp as "YYYY-MM-DD HH:MM:SS +0000" (UTC).
#
#   $date - seconds since the epoch
#
# Returns the formatted string.
sub convert_date {
	my $date = shift;
#	return time2str( $date );
	# The zone is written literally: the broken-down time comes from
	# gmtime(), but strftime's %z reports the offset of the local time
	# zone, which would label a UTC clock time with the wrong offset on
	# any machine not running in UTC.
	return strftime('%Y-%m-%d %H:%M:%S +0000', gmtime( $date ));
}

# Convert one "== Title ==" style heading into "^^ Title" form.
# The number of '^' characters equals the number of leading '=' signs, and
# the result is wrapped in newlines.  Text that does not look like a
# heading (same run of '=' on both sides) is returned unchanged.
sub header {
	my ($line) = @_;

	my ( $marks, $title ) = $line =~ m/^(=+)\s+(.+?)\s+\1$/
		or return $line;

	return "\n" . ( '^' x length($marks) ) . " $title\n";
}

# Wrap $what in the delimiter $with on both sides and return the result.
sub surround {
	my $with = shift;
	my $what = shift;
	return "$with$what$with";
}

# Turn a "{{{ ... }}}" block into a ".pre" block.
# The opening and closing triple braces are removed together with the
# whitespace next to them, and the remaining text is placed between two
# ".pre" lines.  Text without the braces is wrapped as it is.
sub pre {
	my $text = shift;
	# Braces are escaped: an unescaped literal "{" in a pattern is
	# deprecated and, depending on the perl version, warned about or
	# rejected.
	$text =~ s/^\{\{\{\s*//s;
	$text =~ s/\s*\}\}\}$//s;
	return "\n.pre\n" . $text . "\n.pre\n";
}

# Translate the wiki markup of one exported page into the markup that is
# uploaded to the target workspace.
#
#   $body - raw page text
#
# Returns the converted text; the caller's value is not modified.
# The substitutions depend on their order (for example the four-quote
# form has to be handled before the three- and two-quote forms), so the
# sequence must be kept.
sub convert_markup {
	my $body = shift;

	# macros
	$body =~ s/\Q[[TableOfContents]]\E/{toc}/gs;
	$body =~ s/\Q[[BR]]\E/\n/gs;

	# headings are rewritten by header()
	# NOTE(review): the -begin list repeats "===== " - possibly "====== "
	# was intended for a sixth level; confirm before changing.
	$body =~ s/$RE{balanced}{-begin => "= |== |=== |==== |===== |===== "}{-end => " =| ==| ===| ====| ====="}{-keep}/header($1)/gse;

	# quote-based inline markup, longest delimiter first
	$body =~ s/''''(.+?)''''/surround('`',$1)/gse;
	$body =~ s/'''(.+?)'''/surround('*',$1)/gse;
	$body =~ s/''(.+?)''/surround('_',$1)/gse;

	# {{{ ... }}} blocks are rewritten by pre()
	$body =~ s/$RE{balanced}{-begin => "{{{"}{-end => "}}}"}{-keep}/pre($1)/gse;

	# fix bullets
	$body =~ s/^\s+([\*])/$1/gm;

	# fix links; both http and https URLs are recognised so that https
	# links do not stay behind in the source syntax
	$body =~ s/\["([^"]+)"\]/[$1]/gs;
	$body =~ s,\[(https?://\S+)\s+([^\]]+)\],"$2"<$1>,gs;
	$body =~ s,\[(https?://[^\]]+)\],$1,gs;

	# fix hr: put ---- on a line of its own
	$body =~ s,(\S+)----,$1\n----,gs;
	$body =~ s,----(\S+),----\n$1,gs;

	# attachments: names ending in an image extension become {image: },
	# all remaining ones {file: }
	$body =~ s,\[attachment:([^\]]+)(gif|png|jpg|jpeg)\],{image: $1$2},gis;
	$body =~ s,\[attachment:([^\]]+)\],{file: $1},gs;

	return $body;
}

# Number of pages handled so far, compared against $max.
my $count = 0;

# MIME type detection for attachments.
my $m = File::MMagic::XS->new;

# Upload every loaded page: first the original markup, then tags,
# optionally attachments, and finally the converted markup.
foreach my $name ( keys %$page ) {
	last if $count++ == $max;

	my $p = $page->{$name};

	warn "## $name = ",dump( $p ) if $debug;

	my $body = $p->{content} || die "no content?";
	my $date = $p->{date} || die "no date?";

	my @tags = ( 'TamTam' );

	my $full_name = $name;

	# "A/B/Page" is stored as page "Page" tagged with "A" and "B"
	if ( $name =~ m!/! ) {
		my @page_tags = split(m!/!, $name);
		$name = pop @page_tags; # remove page name
		push @tags, @page_tags;
	}

	# link named pages
	$body =~ s,\b$page_link_re\b,[$1],gs;
	$body =~ s,``,,gs;

	$body .= qq{

----

"original"<http://www.razmjenavjestina.org/$full_name> {date: $date}
};

	# NOTE(review): _utf8_off is an Encode internal that only clears the
	# UTF-8 flag; presumably used to send the bytes unchanged - confirm
	# before replacing it with a real encode step.
	Encode::_utf8_off( $body );

	print "$name $date\n";

	# original markup
	$Rester->put_page( $name, { content => $p->{original}, date => $date });

	foreach my $t ( @create_tags ) {
		push @tags, $t if $full_name =~ m/$t/i;
	}

	# A path component can be the same as a tag from @create_tags, and a
	# name with a leading or doubled slash yields an empty component:
	# send each non-empty tag only once, in the original order.
	my %seen;
	@tags = grep { length($_) && !$seen{$_}++ } @tags;

	foreach my $tag ( @tags ) {
		$Rester->put_pagetag( $name, $tag, { date => $date } );
		print "+ tag $tag\n";
	}

	if ( $attachments ) {
		foreach my $att ( @{ $p->{attachments} } ) {
			my $type = $m->get_mime( $att->{full_path} );
			# attachments may be binary, read them without any translation
			my $content = read_file( $att->{full_path}, binmode => ':raw' );
			print "+ attachment ", $att->{name}," $type ", length($content), " bytes\n";
			$Rester->post_attachment($name, $att->{name}, $content, $type );
		}
	}

	# converted page
	$Rester->put_page( $name, { content => $body, date => $date });

}