###########################################################################
#
# mgppbuildproc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This document processor outputs a document
# for mgpp to process

package mgppbuildproc;

use basebuildproc;

BEGIN {
    @mgppbuildproc::ISA = ('basebuildproc');
}

#this must be the same as in mgppbuilder
# Maps a level name (as used in $self->{'levels'}) to the tag name that is
# written into the generated text.
our %level_map = ('document'  => 'Doc',
                  'section'   => 'Sec',
                  'paragraph' => 'Para');

# Constructor.
# All arguments are handed unchanged to the basebuildproc constructor; the
# resulting object is then given the mgpp-specific defaults below and
# re-blessed into the class this constructor was invoked on.
sub new {
    my $class = shift @_;
    # Explicit class-method call instead of the indirect-object form
    # ("new basebuildproc (...)"), which perl can mis-parse.
    my $self = basebuildproc->new(@_);

    # use a different index specification to the default
    $self->{'index'} = "text";

    $self->{'dontindex'} = {};
    $self->{'indexfieldmap'} = {};
    $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
    $self->{'strip_html'} = 1;

    return bless $self, $class;
}

# Stores $indexmap as this object's 'indexfieldmap'.
# The value is stored as given (new() initialises it to an empty hash ref).
sub set_indexfieldmap {
    my $self = shift (@_);
    my ($indexmap) = @_;

    $self->{'indexfieldmap'} = $indexmap;
}

# Returns whatever is currently stored as 'indexfieldmap'.
sub get_indexfieldmap {
    my $self = shift (@_);

    return $self->{'indexfieldmap'};
}

# Stores $levels as this object's 'levels'.
# It is read elsewhere in this file as a hash ref keyed by level name
# ('document', 'section', 'paragraph') with true values for enabled levels.
sub set_levels {
    my $self = shift (@_);
    my ($levels) = @_;

    $self->{'levels'} = $levels;
}

# Stores $strip as the 'strip_html' flag (new() defaults it to 1).
sub set_strip_html {
    my $self = shift (@_);
    my ($strip) = @_;

    $self->{'strip_html'} = $strip;
}

# Returns "section" when the 'section' level is enabled in 'levels',
# otherwise "document".
sub get_gdbm_level {
    my $self = shift (@_);

    #if a Section level index is not built, the gdbm file should be at doc
    #level not Section
    # Look the levels up through a local so that merely asking for the
    # level does not autovivify $self->{'levels'} when set_levels() has
    # not been called yet.
    my $levels = $self->{'levels'};
    if (defined $levels && $levels->{'section'}) {
        return "section";
    }
    return "document";
}

#sub find_paragraphs {
#    $_[1] =~ s/(
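#    $1/gi;
#}

#this function strips the html tags from the doc if ($strip_html) and
# if ($para) replaces matching <p ...> tags with $para
# (NOTE(review): part of this comment was lost; confirm against the original file)

# filter_text ($field, $new_text);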
# don't want to do anything for this version, however,
# in a particular collection you might want to override
# this method to post-process certain fields depending on
# the field, or whether we are outputting it for indexing
}
sub text {
my $self = shift (@_);
my ($doc_obj) = @_;
my $handle = $self->{'output_handle'};
my $outhandle = $self->{'outhandle'};
# only output this document if it is one to be indexed
return if ($doc_obj->get_doc_type() ne "indexed_doc");
my $indexed_doc = $self->is_subcollection_doc($doc_obj);
# this is another document
$self->{'num_docs'} += 1;
# get the parameters for the output
# split on : just in case there is subcoll and lang stuff
my ($fields) = split (/:/, $self->{'index'});
my ($documenttag) = "";
my($documentendtag) = "";
if ($self->{'levels'}->{'document'}) {
$documenttag = "\n<". $level_map{'document'} . ">\n";
$documentendtag = "\n". $level_map{'document'} . ">\n";
}
my ($sectiontag) = "";
if ($self->{'levels'}->{'section'}) {
$sectiontag = "\n<". $level_map{'section'} . ">\n";
}
my ($paratag) = "";
if ($self->{'levels'}->{'paragraph'}) {
if ($self->{'strip_html'}) {
$paratag = "<". $level_map{'paragraph'} . ">";
} else {
print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
}
}
my $doc_section = 0; # just for this document
my $text = $documenttag;
# get the text for this document
my $section = $doc_obj->get_top_section();
while (defined $section) {
# update a few statistics
$doc_section++;
$self->{'num_sections'} += 1;
$text .= "$sectiontag";
if ($indexed_doc) {
if ($self->{'indexing_text'}) {
$text .= "$paratag"; # only add para tags for indexing
# note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
}
$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
foreach my $field (split (/,/, $fields)) {
# only deal with this field if it doesn't start with top or
# this is the first section
my $real_field = $field;
if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
my $new_text = "";
my $tmp_text = "";
if ($real_field eq "text") {
if ($self->{'indexing_text'}) { #tag the text with and
have no spaces, and removes all < and > inside
#these tags
sub preprocess_text {
my $self = shift (@_);
my ($text, $strip_html, $para) = @_;
my ($outtext) = "";
if ($strip_html) {
while ($text =~ /<([^>]*)>/ && $text ne "") {
my $tag = $1;
$outtext .= $`." "; #add everything before the matched tag
$text = $'; #'everything after the matched tag
if ($para && $tag =~ /^\s*p\s/i) {
$outtext .= $para;
}
elsif ($tag =~ /^pre$/) { # a pre tag
$text =~ /<\/pre>/; # find the closing pre tag
my $tmp_text = $`; #everything before the closing pre tag
$text = $'; #'everything after the
$tmp_text =~ s/[<>]//g; # remove all < and >
$outtext.= $tmp_text . " ";
}
}
$outtext .= $text; # add any remaining text
return $outtext;
} #if strip_html
#if ($para) {
#$text =~ s/(