/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 72 - (hide annotations)
Mon Jan 28 21:35:39 2002 UTC (22 years, 3 months ago) by laperla
File size: 3588 byte(s)
- erste Demoinstallation fuer oreilly fertig

1 ulpfr 54 #!/usr/bin/perl
2     # -*- Mode: Perl -*-
3     # $Basename: HTML.pm $
4 laperla 72 # $Revision: 1.6 $
5 ulpfr 54 # Author : Ulrich Pfeifer with Andreas König
6     # Created On : Sat Nov 1 1997
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 61 # Last Modified On: Fri Jan 4 16:06:14 2002
9 ulpfr 54 # Language : CPerl
10 ulpfr 61 # Update Count : 14
11 ulpfr 54 # Status : Unknown, Use with caution!
12     #
13     # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14     #
15     #
16    
17     package WAIT::Parse::Ora;
18 ulpfr 58 use HTML::Parser;
19 laperla 69 use Encode;
20 ulpfr 58 use strict;
21 ulpfr 54 use vars qw(@ISA);
22     @ISA = qw(WAIT::Parse::Base);
23    
24 ulpfr 58 my $debug = 0;
25 laperla 72 my %is_text = (
26 ulpfr 58 p => 'text',
27     # h1 => 'text',
28     # h2 => 'text',
29     # h3 => 'text',
30     title => 'title',
31     );
32 ulpfr 54
33 ulpfr 58 my $p = HTML::Parser->new(
34     api_version => 3,
35     start_h => [\&handle_start, "tagname, attr"],
36     end_h => [\&handle_end, "tagname"],
37     text_h => [\&handle_text, "dtext"],
38     marked_sections => 1,
39     );
40     my %result;
41     my $text;
42     my $open;
43    
44     sub handle_start {
45     my $tag = shift;
46 laperla 72 my $attr = shift;
47 ulpfr 58
48 laperla 72 return unless
49     $is_text{$tag} # well-formed paragraphs
50     ||
51     $tag eq "h3" # good for desc, author, and colo
52     ||
53     ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
54 ulpfr 58 $open++;
55     print ">" x $open, $tag, "\n" if $debug;
56     }
57    
58     sub handle_end {
59     my $tag = shift;
60    
61 laperla 72 return unless $is_text{$tag};
62 ulpfr 58 print "<" x $open, $tag, "\n" if $debug;
63     $open--;
64     $text =~ s/^\s+//;
65     $text =~ s/\s+$//;
66     $text =~ s/\s+/ /g;
67 laperla 72 $result{$is_text{$tag}} .= $text . ' ';
68 ulpfr 58 $text = '';
69     }
70    
71    
72     sub handle_text {
73 laperla 72 my $c = shift;
74     if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
75     $open--;
76     return;
77     }
78     $text .= $c if $open;
79 ulpfr 58 }
80    
81 laperla 69 sub my_parse ($) {
82     my($s) = @_;
83     my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
84     # LATIN for entities
85     # and we would get
86     # mixed content in
87     # result
88     $p->parse($ls);
89     $p->eof;
90     }
91    
92 ulpfr 54 sub split {
93     my ($self, $doc) = @_;
94 laperla 72 my %doc = ( isbn => '',
95     author => '',
96     aboutauthor => '',
97     colophon => '',
98     abstract => ''
99     );
100 ulpfr 54
101 ulpfr 61 if ($doc->{author}) {
102     %result = ();
103     $text = '';
104     $open = 0;
105 laperla 69 my_parse($doc->{author});
106 ulpfr 61 $doc{author} = $result{title};
107 laperla 72 $doc{aboutauthor} = $result{text};
108 ulpfr 61 }
109     if ($doc->{index}) {
110 laperla 72 $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
111     %result = ();
112     $text = '';
113     $open = 0;
114     my_parse($doc->{index});
115     $doc{abstract} = $result{text};
116 ulpfr 61 }
117 laperla 65 if ($doc->{colophon}) {
118     %result = ();
119     $text = '';
120     $open = 0;
121 laperla 69 my_parse($doc->{colophon});
122 laperla 65 $doc{colophon} = $result{text};
123     }
124 ulpfr 58 %result = ();
125     $text = '';
126     $open = 0;
127 ulpfr 54
128 laperla 69 my_parse($doc->{desc});
129 ulpfr 54
130 laperla 72 $doc{desc} = $result{text};
131 ulpfr 58 $doc{title} = $result{title};
132    
133 laperla 69 while (my($k,$v) = each %doc) {
134     my $utf8v = Encode::decode("ISO-8859-1",$v);
135     $doc{$k} = $utf8v;
136     }
137    
138 laperla 72 $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
139     $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
140     $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
141     # warn "desc[$doc{desc}]";
142     # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
143    
144 ulpfr 58 return \%doc;
145 ulpfr 54 }
146 ulpfr 58
147     1;

Properties

Name Value
cvs2svn:cvs-rev 1.6

  ViewVC Help
Powered by ViewVC 1.1.26