WAIT/Parse/Ora.pm

#!/usr/bin/perl
#                              -*- Mode: Perl -*- 
# $Basename: HTML.pm $
# $Revision: 1.6 $
# Author          : Ulrich Pfeifer with Andreas König
# Created On      : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan  4 16:06:14 2002
# Language        : CPerl
# Update Count    : 14
# Status          : Unknown, Use with caution!
# 
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# 
# 

package WAIT::Parse::Ora;
use HTML::Parser;
use Encode;
use strict;
use vars qw(@ISA);
@ISA = qw(WAIT::Parse::Base);

my $debug = 0;
my %is_text = (
            p     => 'text',
#            h1    => 'text',
#            h2    => 'text',
#            h3    => 'text',
            title => 'title',
           );

my $p = HTML::Parser->new(
                          api_version => 3,
                          start_h => [\&handle_start, "tagname, attr"],
                          end_h   => [\&handle_end,   "tagname"],
                          text_h  => [\&handle_text,  "dtext"],
                          marked_sections => 1,
                         );
my %result;
my $text;
my $open;

sub handle_start {
  my $tag = shift;
  my $attr = shift;

  return unless
      $is_text{$tag}                 # well-formed paragraphs
      ||
          $tag eq "h3"               # good for desc, author, and colo
      ||
          ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
  $open++;
  print ">" x $open, $tag,  "\n" if $debug;
}

sub handle_end {
  my $tag = shift;

  return unless $is_text{$tag};
  print "<" x $open, $tag,  "\n" if $debug;
  $open--;
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;
  $text =~ s/\s+/ /g;
  $result{$is_text{$tag}} .= $text . ' ';
  $text = '';
}


sub handle_text {
  my $c = shift;
  if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
    $open--;
    return;
  }
  $text .= $c if $open;
}

sub my_parse ($) {
  my($s) = @_;
  my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
                                                # LATIN for entities
                                                # and we would get
                                                # mixed content in
                                                # result
  $p->parse($ls);
  $p->eof;
}

sub split {
  my ($self, $doc) = @_;
  my %doc = ( isbn => '',
              author => '',
              aboutauthor => '',
              colophon => '',
              abstract => ''
            );

  if ($doc->{author}) {
    %result = ();
    $text = '';
    $open = 0;
    my_parse($doc->{author});
    $doc{author} = $result{title};
    $doc{aboutauthor}  = $result{text};
  }
  if ($doc->{index}) {
    $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
    %result = ();
    $text = '';
    $open = 0;
    my_parse($doc->{index});
    $doc{abstract} = $result{text};
  }
  if ($doc->{colophon}) {
    %result = ();
    $text = '';
    $open = 0;
    my_parse($doc->{colophon});
    $doc{colophon} = $result{text};
  }
  %result = ();
  $text = '';
  $open = 0;

  my_parse($doc->{desc});

  $doc{desc}  = $result{text};
  $doc{title} = $result{title};

  while (my($k,$v) = each %doc) {
    my $utf8v = Encode::decode("ISO-8859-1",$v);
    $doc{$k} = $utf8v;
  }

  $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
  $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
  $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
  # warn "desc[$doc{desc}]";
  # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!

  return \%doc;
}

1;
1	#!/usr/bin/perl
2	# -- Mode: Perl --
3	# $Basename: HTML.pm $
4	# $Revision: 1.6 $
5	# Author : Ulrich Pfeifer with Andreas König
6	# Created On : Sat Nov 1 1997
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Fri Jan 4 16:06:14 2002
9	# Language : CPerl
10	# Update Count : 14
11	# Status : Unknown, Use with caution!
12	#
13	# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14	#
15	#
16
17	package WAIT::Parse::Ora;
18	use HTML::Parser;
19	use Encode;
20	use strict;
21	use vars qw(@ISA);
22	@ISA = qw(WAIT::Parse::Base);
23
24	my $debug = 0;
25	my %is_text = (
26	p => 'text',
27	# h1 => 'text',
28	# h2 => 'text',
29	# h3 => 'text',
30	title => 'title',
31	);
32
33	my $p = HTML::Parser->new(
34	api_version => 3,
35	start_h => [\&handle_start, "tagname, attr"],
36	end_h => [\&handle_end, "tagname"],
37	text_h => [\&handle_text, "dtext"],
38	marked_sections => 1,
39	);
40	my %result;
41	my $text;
42	my $open;
43
44	sub handle_start {
45	my $tag = shift;
46	my $attr = shift;
47
48	return unless
49	$is_text{$tag} # well-formed paragraphs
50	\|\|
51	$tag eq "h3" # good for desc, author, and colo
52	\|\|
53	($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
54	$open++;
55	print ">" x $open, $tag, "\n" if $debug;
56	}
57
58	sub handle_end {
59	my $tag = shift;
60
61	return unless $is_text{$tag};
62	print "<" x $open, $tag, "\n" if $debug;
63	$open--;
64	$text =~ s/^\s+//;
65	$text =~ s/\s+$//;
66	$text =~ s/\s+/ /g;
67	$result{$is_text{$tag}} .= $text . ' ';
68	$text = '';
69	}
70
71
72	sub handle_text {
73	my $c = shift;
74	if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu\|Erg.{1,6}nzende O'Reilly Titel)/) {
75	$open--;
76	return;
77	}
78	$text .= $c if $open;
79	}
80
81	sub my_parse ($) {
82	my($s) = @_;
83	my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
84	# LATIN for entities
85	# and we would get
86	# mixed content in
87	# result
88	$p->parse($ls);
89	$p->eof;
90	}
91
92	sub split {
93	my ($self, $doc) = @_;
94	my %doc = ( isbn => '',
95	author => '',
96	aboutauthor => '',
97	colophon => '',
98	abstract => ''
99	);
100
101	if ($doc->{author}) {
102	%result = ();
103	$text = '';
104	$open = 0;
105	my_parse($doc->{author});
106	$doc{author} = $result{title};
107	$doc{aboutauthor} = $result{text};
108	}
109	if ($doc->{index}) {
110	$doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
111	%result = ();
112	$text = '';
113	$open = 0;
114	my_parse($doc->{index});
115	$doc{abstract} = $result{text};
116	}
117	if ($doc->{colophon}) {
118	%result = ();
119	$text = '';
120	$open = 0;
121	my_parse($doc->{colophon});
122	$doc{colophon} = $result{text};
123	}
124	%result = ();
125	$text = '';
126	$open = 0;
127
128	my_parse($doc->{desc});
129
130	$doc{desc} = $result{text};
131	$doc{title} = $result{title};
132
133	while (my($k,$v) = each %doc) {
134	my $utf8v = Encode::decode("ISO-8859-1",$v);
135	$doc{$k} = $utf8v;
136	}
137
138	$doc{desc} =~ s/^\sAusf\S+hrliche\s+Beschreibung\s//;
139	$doc{abstract} =~ s/\sTitel\s+dem\s+Warenkorb\s+hinzu\S+\s/ /;
140	$doc{abstract} =~ s/\sWarenkorb\s+anzeigen\s/ /;
141	# warn "desc[$doc{desc}]";
142	# warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
143
144	return \%doc;
145	}
146
147	1;