/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 72 - (show annotations)
Mon Jan 28 21:35:39 2002 UTC (22 years, 3 months ago) by laperla
File size: 3588 byte(s)
- erste Demoinstallation fuer oreilly fertig

1 #!/usr/bin/perl
2 # -*- Mode: Perl -*-
3 # $Basename: HTML.pm $
4 # $Revision: 1.6 $
5 # Author : Ulrich Pfeifer with Andreas König
6 # Created On : Sat Nov 1 1997
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 16:06:14 2002
9 # Language : CPerl
10 # Update Count : 14
11 # Status : Unknown, Use with caution!
12 #
13 # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14 #
15 #
16
17 package WAIT::Parse::Ora;
18 use HTML::Parser;
19 use Encode;
20 use strict;
21 use vars qw(@ISA);
22 @ISA = qw(WAIT::Parse::Base);
23
24 my $debug = 0;
25 my %is_text = (
26 p => 'text',
27 # h1 => 'text',
28 # h2 => 'text',
29 # h3 => 'text',
30 title => 'title',
31 );
32
33 my $p = HTML::Parser->new(
34 api_version => 3,
35 start_h => [\&handle_start, "tagname, attr"],
36 end_h => [\&handle_end, "tagname"],
37 text_h => [\&handle_text, "dtext"],
38 marked_sections => 1,
39 );
40 my %result;
41 my $text;
42 my $open;
43
44 sub handle_start {
45 my $tag = shift;
46 my $attr = shift;
47
48 return unless
49 $is_text{$tag} # well-formed paragraphs
50 ||
51 $tag eq "h3" # good for desc, author, and colo
52 ||
53 ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
54 $open++;
55 print ">" x $open, $tag, "\n" if $debug;
56 }
57
58 sub handle_end {
59 my $tag = shift;
60
61 return unless $is_text{$tag};
62 print "<" x $open, $tag, "\n" if $debug;
63 $open--;
64 $text =~ s/^\s+//;
65 $text =~ s/\s+$//;
66 $text =~ s/\s+/ /g;
67 $result{$is_text{$tag}} .= $text . ' ';
68 $text = '';
69 }
70
71
72 sub handle_text {
73 my $c = shift;
74 if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
75 $open--;
76 return;
77 }
78 $text .= $c if $open;
79 }
80
81 sub my_parse ($) {
82 my($s) = @_;
83 my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
84 # LATIN for entities
85 # and we would get
86 # mixed content in
87 # result
88 $p->parse($ls);
89 $p->eof;
90 }
91
92 sub split {
93 my ($self, $doc) = @_;
94 my %doc = ( isbn => '',
95 author => '',
96 aboutauthor => '',
97 colophon => '',
98 abstract => ''
99 );
100
101 if ($doc->{author}) {
102 %result = ();
103 $text = '';
104 $open = 0;
105 my_parse($doc->{author});
106 $doc{author} = $result{title};
107 $doc{aboutauthor} = $result{text};
108 }
109 if ($doc->{index}) {
110 $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
111 %result = ();
112 $text = '';
113 $open = 0;
114 my_parse($doc->{index});
115 $doc{abstract} = $result{text};
116 }
117 if ($doc->{colophon}) {
118 %result = ();
119 $text = '';
120 $open = 0;
121 my_parse($doc->{colophon});
122 $doc{colophon} = $result{text};
123 }
124 %result = ();
125 $text = '';
126 $open = 0;
127
128 my_parse($doc->{desc});
129
130 $doc{desc} = $result{text};
131 $doc{title} = $result{title};
132
133 while (my($k,$v) = each %doc) {
134 my $utf8v = Encode::decode("ISO-8859-1",$v);
135 $doc{$k} = $utf8v;
136 }
137
138 $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
139 $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
140 $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
141 # warn "desc[$doc{desc}]";
142 # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
143
144 return \%doc;
145 }
146
147 1;

Properties

Name Value
cvs2svn:cvs-rev 1.6

  ViewVC Help
Powered by ViewVC 1.1.26