/[webpac]/trunk2/lib/WebPAC.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk2/lib/WebPAC.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 421 by dpavlin, Fri Sep 10 22:24:42 2004 UTC revision 439 by dpavlin, Mon Sep 13 23:13:54 2004 UTC
# Line 9  use Config::IniFiles; Line 9  use Config::IniFiles;
9  use XML::Simple;  use XML::Simple;
10  use Template;  use Template;
11  use Log::Log4perl qw(get_logger :levels);  use Log::Log4perl qw(get_logger :levels);
12    use Time::HiRes qw(time);
13    
14  use Data::Dumper;  use Data::Dumper;
15    
# Line 33  Create new instance of WebPAC using conf Line 34  Create new instance of WebPAC using conf
34    
35   my $webpac = new WebPAC(   my $webpac = new WebPAC(
36          config_file => 'name.conf',          config_file => 'name.conf',
37          [code_page => 'ISO-8859-2',]          code_page => 'ISO-8859-2',
38          [low_mem => 1,]          low_mem => 1,
39   );   );
40    
41  Default C<code_page> is C<ISO-8859-2>.  Default C<code_page> is C<ISO-8859-2>.
42    
43    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
44    
45  This method will also read configuration files  This method will also read configuration files
46  C<global.conf> (used by indexer and Web font-end)  C<global.conf> (used by indexer and Web font-end)
47  and configuration file specified by C<config_file>  and configuration file specified by C<config_file>
48  which describes databases to be indexed.  which describes databases to be indexed.
49    
 C<low_mem> options is double-edged sword. If enabled, WebPAC  
 will run on memory constraint machines (which doesn't have enough  
 physical RAM to create memory structure for whole ISIS database).  
   
 If your machine has 512Mb or more and database is around 10000 records,  
 memory shouldn't be an issue. If you don't have enough physical RAM, you  
 might consider using virtual memory (if your operating system is handling it  
 well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle  
 parsed structure of ISIS database.  
   
 However, when WebPAC is running on desktop machines (or laptops :-), it's  
 highly undesireable for system to start swapping. Using C<low_mem> option can  
 reduce WecPAC memory usage to 16Mb for same database with lookup fields and  
 sorted indexes which stay in RAM. Performance will suffer, but memory usage  
 will really be minimal. It might be also more confortable to run WebPAC reniced  
 on those machines.  
   
50  =cut  =cut
51    
52  # mapping between data type and tag which specify  # mapping between data type and tag which specify
# Line 77  sub new { Line 63  sub new {
63          my $self = {@_};          my $self = {@_};
64          bless($self, $class);          bless($self, $class);
65    
66            $self->{'start_t'} = time();
67    
68          my $log_file = $self->{'log'} || "log.conf";          my $log_file = $self->{'log'} || "log.conf";
69          Log::Log4perl->init($log_file);          Log::Log4perl->init($log_file);
70    
# Line 126  sub new { Line 114  sub new {
114    
115          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
116          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
117                  $log->info("running with low_mem which impacts performance (<64 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                  my $db_file = "data.db";                  my $db_file = "data.db";
120    
# Line 135  sub new { Line 123  sub new {
123                          $log->debug("removed '$db_file' from last run");                          $log->debug("removed '$db_file' from last run");
124                  }                  }
125    
126                  use DBM::Deep;                  require DBM::Deep;
127    
128                  my $db = new DBM::Deep $db_file;                  my $db = new DBM::Deep $db_file;
129    
# Line 144  sub new { Line 132  sub new {
132                  if ($db->error()) {                  if ($db->error()) {
133                          $log->logdie("can't open '$db_file' under low_mem: ",$db->error());                          $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                  } else {                  } else {
135                          $log->debug("using file $db_file for DBM::Deep");                          $log->debug("using file '$db_file' for DBM::Deep");
136                  }                  }
137    
138                  $self->{'db'} = $db;                  $self->{'db'} = $db;
# Line 160  Open CDS/ISIS database using OpenIsis mo Line 148  Open CDS/ISIS database using OpenIsis mo
148   $webpac->open_isis(   $webpac->open_isis(
149          filename => '/data/ISIS/ISIS',          filename => '/data/ISIS/ISIS',
150          code_page => '852',          code_page => '852',
151          limit_mfn => '500',          limit_mfn => 500,
152            start_mfn => 6000,
153          lookup => [ ... ],          lookup => [ ... ],
154   );   );
155    
156  By default, ISIS code page is assumed to be C<852>.  By default, ISIS code page is assumed to be C<852>.
157    
158    If optional parametar C<start_mfn> is set, this will be first MFN to read
159    from database (so you can skip beginning of your database if you need to).
160    
161  If optional parametar C<limit_mfn> is set, it will read just 500 records  If optional parametar C<limit_mfn> is set, it will read just 500 records
162  from database in example above.  from database in example above.
163    
# Line 193  sub open_isis { Line 185  sub open_isis {
185          $log->logcroak("need filename") if (! $arg->{'filename'});          $log->logcroak("need filename") if (! $arg->{'filename'});
186          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
187    
188            $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
189    
190          # store data in object          # store data in object
191          $self->{'isis_filename'} = $arg->{'filename'};          $self->{'isis_filename'} = $arg->{'filename'};
192          $self->{'isis_code_page'} = $code_page;          $self->{'isis_code_page'} = $code_page;
# Line 210  sub open_isis { Line 204  sub open_isis {
204          my $isis_db = OpenIsis::open($arg->{'filename'});          my $isis_db = OpenIsis::open($arg->{'filename'});
205    
206          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
207            my $startmfn = 1;
208    
209          $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});          if (my $s = $self->{'start_mfn'}) {
210                    $log->info("skipping to MFN $s");
211                    $startmfn = $s;
212            }
213    
214          $log->info("processing $maxmfn records...");          $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
215    
216            $log->info("processing ",($maxmfn-$startmfn)." records...");
217    
218          # read database          # read database
219          for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
220    
221    
222                  $log->debug("mfn: $mfn\n");                  $log->debug("mfn: $mfn\n");
# Line 264  sub open_isis { Line 264  sub open_isis {
264    
265          }          }
266    
267          $self->{'current_mfn'} = 1;          $self->{'current_mfn'} = $startmfn;
268          $self->{'last_pcnt'} = 0;          $self->{'last_pcnt'} = 0;
269    
270          $log->debug("max mfn: $maxmfn");          $log->debug("max mfn: $maxmfn");
# Line 329  sub progress_bar { Line 329  sub progress_bar {
329    
330          $self->{'last_pcnt'} ||= 1;          $self->{'last_pcnt'} ||= 1;
331    
         $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});  
   
332          my $p = int($curr * 100 / $max);          my $p = int($curr * 100 / $max);
333    
334            # reset on re-run
335            if ($p < $self->{'last_pcnt'}) {
336                    $self->{'last_pcnt'} = $p;
337                    $self->{'last_t'} = time();
338                    $self->{'last_curr'} = 1;
339            }
340    
341          if ($p != $self->{'last_pcnt'}) {          if ($p != $self->{'last_pcnt'}) {
342                  printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );  
343                    my $last_curr = $self->{'last_curr'} || $curr;
344                    my $t = time();
345                    my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
346                    my $eta = ($max-$curr) / ($rate || 1);
347                    printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
348                  $self->{'last_pcnt'} = $p;                  $self->{'last_pcnt'} = $p;
349                    $self->{'last_t'} = time();
350                    $self->{'last_curr'} = $curr;
351          }          }
352          print STDERR "\n" if ($p == 100);          print STDERR "\n" if ($p == 100);
353  }  }
354    
355    =head2 fmt_time
356    
357    Format time (in seconds) for display.
358    
359     print $webpac->fmt_time(time());
360    
361    This method is called by L<progress_bar> to display remaining time.
362    
363    =cut
364    
365    sub fmt_time {
366            my $self = shift;
367    
368            my $t = shift || 0;
369            my $out = "";
370    
371            my ($ss,$mm,$hh) = gmtime($t);
372            $out .= "${hh}h" if ($hh);
373            $out .= sprintf("%02d:%02d", $mm,$ss);
374            $out .= "  " if ($hh == 0);
375            return $out;
376    }
377    
378  =head2 open_import_xml  =head2 open_import_xml
379    
380  Read file from C<import_xml/> directory and parse it.  Read file from C<import_xml/> directory and parse it.
# Line 774  sub data_structure { Line 810  sub data_structure {
810                          }                          }
811                          next if (! @v);                          next if (! @v);
812    
813                            if ($tag->{'sort'}) {
814                                    # very special sort, ignoring case and
815                                    # html
816                                    @v = sort {
817                                            $a =~ s#<[^>]+/*>##;
818                                            $b =~ s#<[^>]+/*>##;
819                                            lc($b) cmp lc($a)
820                                    } @v;
821                                    $log->warn("sort within tag is usually not what you want!");
822                                    $log->debug("sorted values: ",sub { join(", ",@v) });
823                            }
824    
825                          # use format?                          # use format?
826                          if ($tag->{'format_name'}) {                          if ($tag->{'format_name'}) {
827                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
# Line 788  sub data_structure { Line 836  sub data_structure {
836                                  next; # don't return headline in data_structure!                                  next; # don't return headline in data_structure!
837                          }                          }
838    
839                          # does tag have type?                          # delimiter will join repeatable fields
840                          if ($tag->{'type'}) {                          if ($tag->{'delimiter'}) {
841                                  push @{$row->{$tag->{'type'}}}, @v;                                  @v = ( join($tag->{'delimiter'}, @v) );
842                          } else {                          }
843                                  push @{$row->{'display'}}, @v;  
844                                  push @{$row->{'swish'}}, @v;                          # default types
845                            my @types = qw(display swish);
846                            # override by type attribute
847                            @types = ( $tag->{'type'} ) if ($tag->{'type'});
848    
849                            foreach my $type (@types) {
850                                    # append to previous line?
851                                    $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
852                                    if ($tag->{'append'}) {
853    
854                                            # I will delimit appended part with
855                                            # delimiter (or ,)
856                                            my $d = $tag->{'delimiter'};
857                                            # default delimiter
858                                            $d ||= ", ";
859    
860                                            my $last = pop @{$row->{$type}};
861                                            $d = "" if (! $last);
862                                            $last .= $d . join($d, @v);
863                                            push @{$row->{$type}}, $last;
864    
865                                    } else {
866                                            push @{$row->{$type}}, @v;
867                                    }
868                          }                          }
869    
870    
# Line 806  sub data_structure { Line 877  sub data_structure {
877                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
878                          $row->{'name'} = $name ? $self->_x($name) : $field;                          $row->{'name'} = $name ? $self->_x($name) : $field;
879    
880                            # post-sort all values in field
881                            if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
882                                    $log->warn("sort at field tag not implemented");
883    
884                            }
885    
886                          push @ds, $row;                          push @ds, $row;
887    
888                          $log->debug("row $field: ",sub { Dumper($row) });                          $log->debug("row $field: ",sub { Dumper($row) });
# Line 1021  B<This is different from normal Log4perl Line 1098  B<This is different from normal Log4perl
1098  also use method names, and not only classes (which are just few)  also use method names, and not only classes (which are just few)
1099  to filter logging.  to filter logging.
1100    
1101    
1102    =head1 MEMORY USAGE
1103    
1104    C<low_mem> options is double-edged sword. If enabled, WebPAC
1105    will run on memory constraint machines (which doesn't have enough
1106    physical RAM to create memory structure for whole source database).
1107    
1108    If your machine has 512Mb or more of RAM and database is around 10000 records,
1109    memory shouldn't be an issue. If you don't have enough physical RAM, you
1110    might consider using virtual memory (if your operating system is handling it
1111    well, like on FreeBSD or Linux) instead of dropping to L<DBD::Deep> to handle
1112    parsed structure of ISIS database (this is what C<low_mem> option does).
1113    
1114    Hitting swap at end of reading source database is probably o.k. However,
1115    hitting swap before 90% will dramatically decrease performance and you will
1116    be better off with C<low_mem> and using rest of availble memory for
1117    operating system disk cache (Linux is particuallary good about this).
1118    However, every access to database record will require disk access, so
1119    generation phase will be slower 10-100 times.
1120    
1121    Parsed structures are essential - you just have option to trade RAM memory
1122    (which is fast) for disk space (which is slow). Be sure to have planty of
1123    disk space if you are using C<low_mem> and thus L<DBD::Deep>.
1124    
1125    However, when WebPAC is running on desktop machines (or laptops :-), it's
1126    highly undesireable for system to start swapping. Using C<low_mem> option can
1127    reduce WecPAC memory usage to around 64Mb for same database with lookup
1128    fields and sorted indexes which stay in RAM. Performance will suffer, but
1129    memory usage will really be minimal. It might be also more confortable to
1130    run WebPAC reniced on those machines.
1131    
1132  =cut  =cut
1133    
1134  1;  1;

Legend:
Removed from v.421  
changed lines
  Added in v.439

  ViewVC Help
Powered by ViewVC 1.1.26