/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 57 by ulpfr, Mon Dec 31 14:05:55 2001 UTC revision 58 by ulpfr, Fri Jan 4 14:51:09 2002 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl  #!/usr/bin/perl
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
3  # $Basename: HTML.pm $  # $Basename: HTML.pm $
4  # $Revision: 1.1 $  # $Revision: 1.2 $
5  # Author          : Ulrich Pfeifer with Andreas König  # Author          : Ulrich Pfeifer with Andreas König
6  # Created On      : Sat Nov 1 1997  # Created On      : Sat Nov 1 1997
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
8  # Last Modified On: Mon Dec 31 14:51:55 2001  # Last Modified On: Fri Jan  4 15:44:34 2002
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 7  # Update Count    : 9
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.  # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
# Line 15  Line 15 
15  #  #
16    
17  package WAIT::Parse::Ora;  package WAIT::Parse::Ora;
18    use HTML::Parser;
19    use strict;
20  use vars qw(@ISA);  use vars qw(@ISA);
 require HTML::Parse;  
 require HTML::FormatText;  
 use HTML::Entities qw(decode_entities);  
21  @ISA = qw(WAIT::Parse::Base);  @ISA = qw(WAIT::Parse::Base);
22    
23    my $debug = 0;
24    my %text = (
25                p     => 'text',
26    #            h1    => 'text',
27    #            h2    => 'text',
28    #            h3    => 'text',
29                title => 'title',
30               );
31    
32    my $p = HTML::Parser->new(
33                              api_version => 3,
34                              start_h => [\&handle_start, "tagname, attr"],
35                              end_h   => [\&handle_end,   "tagname"],
36                              text_h  => [\&handle_text,  "dtext"],
37                              marked_sections => 1,
38                             );
39    my %result;
40    my $text;
41    my $open;
42    
43    sub handle_start {
44      my $tag = shift;
45    
46      return unless $text{$tag};
47      $open++;
48      print ">" x $open, $tag,  "\n" if $debug;
49    }
50    
51    sub handle_end {
52      my $tag = shift;
53    
54      return unless $text{$tag};
55      print "<" x $open, $tag,  "\n" if $debug;
56      $open--;
57      $text =~ s/^\s+//;
58      $text =~ s/\s+$//;
59      $text =~ s/\s+/ /g;
60      $result{$text{$tag}} .= $text . ' ';
61      $text = '';
62    }
63    
64    
65    sub handle_text {
66      $text .= $_[0] if $open;
67    }
68    
69  sub split {  sub split {
70    my ($self, $doc) = @_;    my ($self, $doc) = @_;
71      my %doc;
72    my $desc = $doc->{desc};    my $desc = $doc->{desc};
73    my $auth = $doc->{author};    my $auth = $doc->{author};
   my ($title)  = $desc =~ /<title\s*>(.*?)<\/title\s*>/si;  
   my ($author) = $auth =~ /<title\s*>(.*?)<\/title\s*>/si;  
   my $html = HTML::Parse::parse_html($desc);  
   my $formatter = HTML::FormatText->new;  
   
   {  
    'text',  $formatter->format($html),  
    'title', $formatter->format(HTML::Parse::parse_html($title)),  
    'author', $formatter->format(HTML::Parse::parse_html($author)),  
  };  
 }  
74    
75  sub tag {    %result = ();
76    my ($self, $doc) = @_;    $text = '';
77      $open = 0;
78    
79      $p->parse($doc->{author});
80      $p->eof;
81      $doc{author} = $result{title};
82      $doc{about}  = $result{text};
83    
84      %result = ();
85      $text = '';
86      $open = 0;
87    
88    my $html_source = $doc->{desc};    $p->parse($doc->{desc});
89    $html_source =~ tr/\r/\n/;    $p->eof;
90    
91    my ($pre,$title,$body)    $doc{text}  = $result{text};
92        = $html_source =~ /^(.*?<title\s*>)(.*?)(<\/title\s*>.+)/si;    $doc{title} = $result{title};
93    
94    (    return \%doc;
    {'text'  => 1},  decode_entities($pre),  
    {'title' => 1},  decode_entities($title),  
    {'text'  => 1},  decode_entities($body),  
   );  
95  }  }
96    
97    1;

Legend:
Removed from v.57  
changed lines
  Added in v.58

  ViewVC Help
Powered by ViewVC 1.1.26