/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory | Revision Log | View Patch

-revision 65 by laperla,
Wed Jan 23 12:22:54 2002 UTC
+revision 73 by laperla,
Tue Mar  5 13:40:38 2002 UTC
 Line 1
  #!/usr/bin/perl
  #                              -*- Mode: Perl -*-
  # $Basename: HTML.pm $
- # $Revision: 1.4 $
+ # $Revision: 1.7 $
  # Author          : Ulrich Pfeifer with Andreas König
  # Created On      : Sat Nov 1 1997
  # Last Modified By: Ulrich Pfeifer
 Line 16
  package WAIT::Parse::Ora;
  use HTML::Parser;
+ use Encode;
  use strict;
  use vars qw(@ISA);
  @ISA = qw(WAIT::Parse::Base);
+ =pod
+ Text from 2002-03-05 is structured with <div> tags as follows:
+   index.html:
+   <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
+   <div id="short_desc"> KURZE BESCHREIBUNG
+   desc.html:
+   <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
+   author.html:
+   <div id="author_bio"> BIOGRAPHIE DES AUTOREN
+   translator.html:
+   <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
+ =cut
  my $debug = 0;
- my %text = (
+ my %is_text = (
-             p     => 'text',
+                p     => 'text',
+                a     => 'text', # uebersetzer
  #            h1    => 'text',
  #            h2    => 'text',
  #            h3    => 'text',
-             title => 'title',
+                title => 'title',
             );
  my $p = HTML::Parser->new(
-Line 42 
 my $open;
+Line 64 
 my $open;
  sub handle_start {
    my $tag = shift;
+   my $attr = shift;
-   return unless $text{$tag};
+   return unless
+       $is_text{$tag}                 # well-formed paragraphs
+       ||
+           $tag eq "h3"               # good for desc, author, and colo
+       ||
+           ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
    $open++;
    print ">" x $open, $tag,  "\n" if $debug;
  }
-Line 51 
 sub handle_start {
+Line 79 
 sub handle_start {
  sub handle_end {
    my $tag = shift;
-   return unless $text{$tag};
+   return unless $is_text{$tag};
    print "<" x $open, $tag,  "\n" if $debug;
    $open--;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
-   $result{$text{$tag}} .= $text . ' ';
+   $result{$is_text{$tag}} .= $text . ' ';
    $text = '';
  }
  sub handle_text {
-   $text .= $_[0] if $open;
+   my $c = shift;
+   if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
+     $open--;
+     return;
+   }
+   $text .= $c if $open;
+ }
+ sub my_parse ($) {
+   my($s) = @_;
+   my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
+                                                 # LATIN for entities
+                                                 # and we would get
+                                                 # mixed content in
+                                                 # result
+   $p->parse($ls);
+   $p->eof;
  }
  sub split {
    my ($self, $doc) = @_;
-   my %doc = ( isbn => '', author => '', about => '', colophon => '' );
+   my %doc = ( isbn => '',
-   my $desc = $doc->{desc};
+               author => '',
-   my $auth = $doc->{author};
+               aboutauthor => '',
-   my $colophon = $doc->{colophon};
+               colophon => '',
+               abstract => ''
+             );
    if ($doc->{author}) {
      %result = ();
      $text = '';
      $open = 0;
-     $p->parse($doc->{author});
+     my_parse($doc->{author});
-     $p->eof;
      $doc{author} = $result{title};
-     $doc{author} =~ s/^By\s+//;
+     $doc{aboutauthor}  = $result{text};
-     $doc{about}  = $result{text};
    }
    if ($doc->{index}) {
-     $doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1;
+     $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
+     %result = ();
+     $text = '';
+     $open = 0;
+     my_parse($doc->{index});
+     $doc{abstract} = $result{text};
    }
    if ($doc->{colophon}) {
      %result = ();
      $text = '';
      $open = 0;
-     $p->parse($doc->{colophon});
+     my_parse($doc->{colophon});
-     $p->eof;
      $doc{colophon} = $result{text};
    }
    %result = ();
    $text = '';
    $open = 0;
-   $p->parse($doc->{desc});
+   my_parse($doc->{desc});
-   $p->eof;
-   $doc{text}  = $result{text};
+   $doc{desc}  = $result{text};
    $doc{title} = $result{title};
+   while (my($k,$v) = each %doc) {
+     my $utf8v = Encode::decode("ISO-8859-1",$v);
+     $doc{$k} = $utf8v;
+   }
+   $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
+   $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
+   $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
+   # warn "desc[$doc{desc}]";
+   # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
    return \%doc;
  }

 Legend:



Removed from v.65
 


changed lines


 
Added in v.73
 Legend:



Removed from v.65
 


changed lines


 
Added in v.73
-Removed from v.65
+Added in v.73

	ViewVC Help
Powered by ViewVC 1.1.26