/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory | Revision Log | View Patch

-revision 69 by laperla,
Fri Jan 25 07:27:30 2002 UTC
+revision 72 by laperla,
Mon Jan 28 21:35:39 2002 UTC
 Line 1
  #!/usr/bin/perl
  #                              -*- Mode: Perl -*-
  # $Basename: HTML.pm $
- # $Revision: 1.5 $
+ # $Revision: 1.6 $
  # Author          : Ulrich Pfeifer with Andreas König
  # Created On      : Sat Nov 1 1997
  # Last Modified By: Ulrich Pfeifer
 Line 22 
 use vars qw(@ISA);
  @ISA = qw(WAIT::Parse::Base);
  my $debug = 0;
- my %text = (
+ my %is_text = (
              p     => 'text',
  #            h1    => 'text',
  #            h2    => 'text',
 Line 43 
 my $open;
  sub handle_start {
    my $tag = shift;
+   my $attr = shift;
-   return unless $text{$tag};
+   return unless
+       $is_text{$tag}                 # well-formed paragraphs
+       ||
+           $tag eq "h3"               # good for desc, author, and colo
+       ||
+           ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
    $open++;
    print ">" x $open, $tag,  "\n" if $debug;
  }
-Line 52 
 sub handle_start {
+Line 58 
 sub handle_start {
  sub handle_end {
    my $tag = shift;
-   return unless $text{$tag};
+   return unless $is_text{$tag};
    print "<" x $open, $tag,  "\n" if $debug;
    $open--;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    $text =~ s/\s+/ /g;
-   $result{$text{$tag}} .= $text . ' ';
+   $result{$is_text{$tag}} .= $text . ' ';
    $text = '';
  }
  sub handle_text {
-   $text .= $_[0] if $open;
+   my $c = shift;
+   if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
+     $open--;
+     return;
+   }
+   $text .= $c if $open;
  }
  sub my_parse ($) {
-Line 80 
 sub my_parse ($) {
+Line 91 
 sub my_parse ($) {
  sub split {
    my ($self, $doc) = @_;
-   my %doc = ( isbn => '', author => '', about => '', colophon => '' );
+   my %doc = ( isbn => '',
-   my $desc = $doc->{desc};
+               author => '',
-   my $auth = $doc->{author};
+               aboutauthor => '',
-   my $colophon = $doc->{colophon};
+               colophon => '',
+               abstract => ''
+             );
    if ($doc->{author}) {
      %result = ();
-Line 91 
 sub split {
+Line 104 
 sub split {
      $open = 0;
      my_parse($doc->{author});
      $doc{author} = $result{title};
-     $doc{author} =~ s/^By\s+//;
+     $doc{aboutauthor}  = $result{text};
-     $doc{about}  = $result{text};
    }
    if ($doc->{index}) {
-     $doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1;
+     $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
+     %result = ();
+     $text = '';
+     $open = 0;
+     my_parse($doc->{index});
+     $doc{abstract} = $result{text};
    }
    if ($doc->{colophon}) {
      %result = ();
-Line 110 
 sub split {
+Line 127 
 sub split {
    my_parse($doc->{desc});
-   $doc{text}  = $result{text};
+   $doc{desc}  = $result{text};
    $doc{title} = $result{title};
    while (my($k,$v) = each %doc) {
-Line 118 
 sub split {
+Line 135 
 sub split {
      $doc{$k} = $utf8v;
    }
+   $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
+   $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
+   $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
+   # warn "desc[$doc{desc}]";
+   # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
    return \%doc;
  }

 Legend:



Removed from v.69
 


changed lines


 
Added in v.72
 Legend:



Removed from v.69
 


changed lines


 
Added in v.72
-Removed from v.69
+Added in v.72

	ViewVC Help
Powered by ViewVC 1.1.26