###########################################################################
#
# EMAILPlug.pm - a plugin for parsing email files
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
# EMAILPlug
#
# by Gordon Paynter (gwp@cs.waikato.ac.nz)
#
# Email plug reads email files. These are named with a simple
# number (i.e. as they appear in mh_mail folders) or with the
# extension .email
#
# Document text:
# The document text consists of all the text
# after the first blank line in the document.
#
# Metadata:
# $Headers All the header content
# $Subject Subject: header
# $To To: header
# $From From: header - this will be stored as Creator
# $DateText Date: header
# $Date Date: header in GSDL format (eg: 19990924)
#
# Version history
#
# 1.2 (2000 Jun 12) Major rewrite.
# (The new version of Greenstone breaks some of the metadata.)
# 1.1.1 Compensated for two-digit years like "95"
# 1.1 (1999 Sep 20) Introduced the various metadata fileds
# 1.0 Based on the original HTMLPlug code
#
package EMAILPlug;
use BasPlug;
use sorttools;
use util;
# EMAILPlug is a sub-class of BasPlug.
sub BEGIN {
@ISA = ('BasPlug');
}
# Create a new EMAILPlug object with which to parse a file.
# Accomplished by creating a new BasPlug and using bless to
# turn it into an EMAILPlug.
sub new {
my ($class) = @_;
my $self = new BasPlug ("EMAILPlug", @_);
return bless $self, $class;
}
sub get_default_process_exp {
my $self = shift (@_);
return q^\d+(\.email)?$^;
}
# do plugin specific processing of doc_obj
sub process {
my $self = shift (@_);
my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
my $outhandle = $self->{'outhandle'};
# Check that we're dealing with a valid mail file
return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));
print $outhandle "EMAILPlug: processing $file\n"
if $self->{'verbosity'} > 1;
my $cursection = $doc_obj->get_top_section();
#
# Parse the document's text and extract metadata
#
# Separate header from body of message
my $Headers = $$textref;
$Headers =~ s/\n\n.*//s;
$$textref = substr $$textref, (length $Headers);
# Extract basic metadata from header
my @headers = ("From", "To", "Subject", "Date");
my $value = "";
my %raw;
foreach my $name (@headers) {
$value = $Headers;
$value =~ s/.*$name://s;
$value =~ s/\S*:.*//s;
$value =~ s/\s*$//;
$value =~ s/\s+/ /g;
$raw{$name} = $value;
}
# Process Date information
if ($raw{"Date"}) {
$raw{"DateText"} = $raw{"Date"};
# Convert the date text to internal date format
$value = $raw{"Date"};
my ($day, $month, $year) = $value =~ /(\d?\d)\s([A-Z][a-z][a-z])\s(\d\d\d?\d?)/;
if ($year < 100) { $year += 1900; }
$raw{"Date"} = &sorttools::format_date($day, $month, $year);
} else {
# We have not extracted a date
$raw{"DateText"} = "Unknown.";
$raw{"Date"} = "19000000";
}
# Add extracted metadata to document object
foreach my $name (keys %raw) {
$value = $raw{$name};
if ($value) {
$value = &text_into_html($value);
} else {
$value = "No $name field";
}
$doc_obj->add_utf8_metadata ($cursection, $name, $value);
}
# Add "All headers" metadata
$Headers = &text_into_html($Headers);
$Headers = "No headers" unless ($Headers =~ /\w/);
$doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
# Add text to document object
$$textref = &text_into_html($$textref);
$$textref = "No message" unless ($$textref =~ /\w/);
$doc_obj->add_utf8_text($cursection, $$textref);
return 1;
}
# Convert a text string into HTML.
#
# The HTML is going to be inserted into a GML file, so
# we have to be careful not to use symbols like ">",
# which ocurs frequently in email messages (and use
# > instead.
#
# This function also turns links and email addresses into hyperlinks,
# and replaces carriage returns with
tags (and multiple carriage
# returns with
tags).
sub text_into_html {
my ($text) = @_;
# Convert problem characters into HTML symbols
$text =~ s/&/&/go;
$text =~ s/</go;
$text =~ s/>/>/go;
$text =~ s/\"/"/go;
# convert email addresses and URLs into links
$text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/$1<\/a>/g;
$text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-~]*)/$1<\/a>/g;
# Clean up whitespace and convert \n charaters to
$text =~ s/ +/ /go;
$text =~ s/\s*$//o;
$text =~ s/^\s*//o;
$text =~ s/\n/\n /go;
return $text;
}
# Perl packages have to return true if they are run.
1;
or
/go;
$text =~ s/
\s*
/