/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory | Revision Log | View Patch

revision 71 by laperla, Fri Jan 25 07:27:30 2002 UTC revision 72 by laperla, Mon Jan 28 21:35:39 2002 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl  #!/usr/bin/perl
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
3  # $Basename: HTML.pm $  # $Basename: HTML.pm $
4  # $Revision: 1.5 $  # $Revision: 1.6 $
5  # Author          : Ulrich Pfeifer with Andreas König  # Author          : Ulrich Pfeifer with Andreas König
6  # Created On      : Sat Nov 1 1997  # Created On      : Sat Nov 1 1997
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
# Line 22  use vars qw(@ISA); Line 22  use vars qw(@ISA);
22  @ISA = qw(WAIT::Parse::Base);  @ISA = qw(WAIT::Parse::Base);
23    
24  my $debug = 0;  my $debug = 0;
25  my %text = (  my %is_text = (
26              p     => 'text',              p     => 'text',
27  #            h1    => 'text',  #            h1    => 'text',
28  #            h2    => 'text',  #            h2    => 'text',
# Line 43  my $open; Line 43  my $open;
43    
44  sub handle_start {  sub handle_start {
45    my $tag = shift;    my $tag = shift;
46      my $attr = shift;
47    
48    return unless $text{$tag};    return unless
49          $is_text{$tag}                 # well-formed paragraphs
50          ||
51              $tag eq "h3"               # good for desc, author, and colo
52          ||
53              ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
54    $open++;    $open++;
55    print ">" x $open, $tag,  "\n" if $debug;    print ">" x $open, $tag,  "\n" if $debug;
56  }  }
# Line 52  sub handle_start { Line 58  sub handle_start {
58  sub handle_end {  sub handle_end {
59    my $tag = shift;    my $tag = shift;
60    
61    return unless $text{$tag};    return unless $is_text{$tag};
62    print "<" x $open, $tag,  "\n" if $debug;    print "<" x $open, $tag,  "\n" if $debug;
63    $open--;    $open--;
64    $text =~ s/^\s+//;    $text =~ s/^\s+//;
65    $text =~ s/\s+$//;    $text =~ s/\s+$//;
66    $text =~ s/\s+/ /g;    $text =~ s/\s+/ /g;
67    $result{$text{$tag}} .= $text . ' ';    $result{$is_text{$tag}} .= $text . ' ';
68    $text = '';    $text = '';
69  }  }
70    
71    
72  sub handle_text {  sub handle_text {
73    $text .= $_[0] if $open;    my $c = shift;
74      if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
75        $open--;
76        return;
77      }
78      $text .= $c if $open;
79  }  }
80    
81  sub my_parse ($) {  sub my_parse ($) {
# Line 80  sub my_parse ($) { Line 91  sub my_parse ($) {
91    
92  sub split {  sub split {
93    my ($self, $doc) = @_;    my ($self, $doc) = @_;
94    my %doc = ( isbn => '', author => '', about => '', colophon => '' );    my %doc = ( isbn => '',
95    my $desc = $doc->{desc};                author => '',
96    my $auth = $doc->{author};                aboutauthor => '',
97    my $colophon = $doc->{colophon};                colophon => '',
98                  abstract => ''
99                );
100    
101    if ($doc->{author}) {    if ($doc->{author}) {
102      %result = ();      %result = ();
# Line 91  sub split { Line 104  sub split {
104      $open = 0;      $open = 0;
105      my_parse($doc->{author});      my_parse($doc->{author});
106      $doc{author} = $result{title};      $doc{author} = $result{title};
107      $doc{author} =~ s/^By\s+//;      $doc{aboutauthor}  = $result{text};
     $doc{about}  = $result{text};  
108    }    }
109    if ($doc->{index}) {    if ($doc->{index}) {
110      $doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1;      $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
111        %result = ();
112        $text = '';
113        $open = 0;
114        my_parse($doc->{index});
115        $doc{abstract} = $result{text};
116    }    }
117    if ($doc->{colophon}) {    if ($doc->{colophon}) {
118      %result = ();      %result = ();
# Line 110  sub split { Line 127  sub split {
127    
128    my_parse($doc->{desc});    my_parse($doc->{desc});
129    
130    $doc{text}  = $result{text};    $doc{desc}  = $result{text};
131    $doc{title} = $result{title};    $doc{title} = $result{title};
132    
133    while (my($k,$v) = each %doc) {    while (my($k,$v) = each %doc) {
# Line 118  sub split { Line 135  sub split {
135      $doc{$k} = $utf8v;      $doc{$k} = $utf8v;
136    }    }
137    
138      $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
139      $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
140      $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
141      # warn "desc[$doc{desc}]";
142      # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
143    
144    return \%doc;    return \%doc;
145  }  }
146    

Legend:
Removed from v.71  
changed lines
  Added in v.72

  ViewVC Help
Powered by ViewVC 1.1.26