###########################################################################
#
# arcinfo.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This module stores information about the archives. At the moment
# this information just consists of the file name (relative to the
# directory the archives information file is in) and its OID.

# This module assumes there is a one to one correspondance between
# a file in the archives directory and an OID.

package arcinfo;

use constant ORDER_OID_INDEX  => 0;
use constant ORDER_SORT_INDEX => 1;

use constant INFO_FILE_INDEX    => 0;
use constant INFO_STATUS_INDEX  => 1;

use constant INFO_GROUPPOS_INDEX  => 3;
use strict;

use dbutil;


# File format read in: OID <tab> Filename <tab> Optional-Index-Status

# Index status can be:
#  I = Index for the first time
#  R = Reindex
#  D = Delete
#  B = Been indexed

sub new {
    my $class = shift(@_);
    my $infodbtype = shift(@_);

    # If the infodbtype wasn't passed in, use the default from dbutil
    if (!defined($infodbtype))
    {
      $infodbtype = &dbutil::get_default_infodb_type();
    }

    my $self = {'infodbtype' => $infodbtype,
		'timestamp_file' => "archiveinf-timestamp.out",
		'info'        => {},
		'reverse-info'=> {},
		'order'       => [],
		'reverse_sort'=> 0,
		'sort'        => 0};

    return bless $self, $class;
}

sub _load_info_txt 
{
    my $self = shift (@_);
    my ($filename) = @_;

    if (defined $filename && &FileUtils::fileExists($filename)) {
	open (INFILE, $filename) || 
	    die "arcinfo::load_info couldn't read $filename\n";

	my ($line, @line);
	while (defined ($line = <INFILE>)) {
	    $line =~ s/\cM|\cJ//g; # remove end-of-line characters
	    @line = split ("\t", $line); # filename, 
	    if (scalar(@line) >= 2) {
		$self->add_info (@line);
	    }
	}
	close (INFILE);
    }
}

sub _load_info_db
{
    my $self = shift (@_);
    my ($filename) = @_;

    my $infodb_map = {};

    &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);

    foreach my $oid ( keys %$infodb_map ) {
	my $vals = $infodb_map->{$oid};
	# interested in doc-file and index-status

	my ($doc_file) = ($vals=~/^<doc-file>(.*)$/m);
	my ($index_status) = ($vals=~/^<index-status>(.*)$/m);
	my ($sortmeta) = ($vals=~/^<sort-meta>(.*)$/m);
	my ($group_position) = ($vals=~/^<group-position>(.*)$/m);
	$self->add_info ($oid,$doc_file,$index_status,$sortmeta, $group_position);
    }
}


sub load_info {
    my $self = shift (@_);
    my ($filename) = @_;

    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
    
    $self->{'info'} = {};

    if (defined $filename) {
	
	my $timestamp_filename = $self->get_timestamp_filename($filename);
	if (&FileUtils::fileExists($timestamp_filename)) {
	    my ($prev_infodbtype,$unused_timestamp) = $self->load_timestamp($filename);
	    $self->{'prev-infodbtype'} = $prev_infodbtype;
	}

	if (!&FileUtils::fileExists($filename)) {
	    # Typically a sign of a first-time, or -removeold build

	    # ... but could be because the value of infodbtype has changed since the last build
	    # => Check

	    my $prev_infodbtype = $self->{'prev-infodbtype'};
	    my $infodbtype = $self->{'infodbtype'};
	    
	    if (defined $prev_infodbtype && ($prev_infodbtype ne $infodbtype)) {
		print STDERR "arcinfo::load_info() detected change in infodbtype from previous build: $prev_infodbtype -> $infodbtype\n";
		print STDERR "=> adjusting archiveinf filename accordingly.\n";

		$filename =~ s/\.$infodbtype$/.$prev_infodbtype/;
	    }
	}

	# $filename might have changed if infodbtypes don't match
	if (&FileUtils::fileExists($filename)) {
	    if ($filename =~ m/\.inf$/) {
		$self->_load_info_txt($filename);
	    }
	    else {
		$self->_load_info_db($filename);
	    }
	}
    }    
}

sub _load_filelist_db
{
    my $self = shift (@_);
    my ($filename) = @_;

    my $infodb_map = {};

    &dbutil::read_infodb_file($self->{'infodbtype'}, $filename, $infodb_map);

    foreach my $file ( keys %$infodb_map ) {
	# turn placeholders in the file keys of archiveinf-src file back to absolute paths
	$file = &util::placeholders_to_abspath($file);
	$self->{'prev_import_filelist'}->{$file} = 1;
    }
}


sub load_prev_import_filelist {
    my $self = shift (@_);
    my ($filename) = @_;

    $self->{'import-filelist'} = {};

    if ((defined $filename) && &FileUtils::fileExists($filename)) {
	    
	if ($filename =~ m/\.inf$/) {
	    # e.g. 'archives-src.inf' (which includes complete list of file
	    # from last time import.pl was run)
	    $self->_load_info_txt($filename);
	}
	else {
	    $self->_load_filelist_db($filename);
	}
    }
}

# Loads in reverse-lookup info:
#   Key   = a file in import
#   Value = all the the doc ids that file is used in
#
sub load_rev_info
{
    my $self = shift (@_);
    my ($rev_filename) = @_;

    my $rev_infodb_map = {};

    if ((defined $rev_filename) && &FileUtils::fileExists($rev_filename)) {
	&dbutil::read_infodb_file($self->{'infodbtype'}, $rev_filename, $rev_infodb_map);

	foreach my $srcfile ( keys %$rev_infodb_map ) {
	    
	    my $vals = $rev_infodb_map->{$srcfile};
	    
	    $srcfile = &util::placeholders_to_abspath($srcfile);
	    foreach my $OID ($vals =~ m/^<oid>(.*)$/gm) {
		$self->add_reverseinfo($srcfile,$OID);
	    }
	}
    }
}


sub _save_info_txt {
    my $self = shift (@_);
    my ($filename) = @_;

    my ($OID, $info);

    open (OUTFILE, ">$filename") || 
	die "arcinfo::save_info couldn't write $filename\n";
  
    foreach $info (@{$self->get_OID_list()}) {
	if (defined $info) {
	    print OUTFILE join("\t", @$info), "\n";
	}
    }
    close (OUTFILE);
}

sub _save_info_db {
    my $self = shift (@_);
    my ($filename) = @_;

    my $infodbtype = $self->{'infodbtype'};

    # Not the most efficient operation, but will do for now

    # read it in
    my $infodb_map = {};
    &dbutil::read_infodb_file($infodbtype, $filename, $infodb_map);

    # change index-status values
    foreach my $info (@{$self->get_OID_list()}) {
	if (defined $info) {
	    my ($oid,$doc_file,$index_status) = @$info;
	    if (defined $infodb_map->{$oid}) {
		my $vals_ref = \$infodb_map->{$oid};
		$$vals_ref =~ s/^<index-status>(.*)$/<index-status>$index_status/m;
	    }
	    else {
		print STDERR "Warning: $filename does not have key $oid\n";
	    }
	}
    }


    # write out again
    my $infodb_handle = &dbutil::open_infodb_write_handle($infodbtype, $filename);
    foreach my $oid ( keys %$infodb_map ) {
	my $vals = $infodb_map->{$oid};
	&dbutil::write_infodb_rawentry($infodbtype,$infodb_handle,$oid,$vals);
    }
    &dbutil::close_infodb_write_handle($infodbtype, $infodb_handle);

}

sub save_revinfo_db {
    my $self = shift (@_);
    my ($rev_filename) = @_;
    # Output reverse lookup database

    my $rev_infodb_map = $self->{'reverse-info'};
    my $rev_infodb_handle
	= &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $rev_filename);
#	= &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $rev_filename, "append");

    foreach my $key ( keys %$rev_infodb_map ) {
	my $val_hash = $rev_infodb_map->{$key};
	$key = &util::abspath_to_placeholders($key);	

	&dbutil::write_infodb_entry($self->{'infodbtype'}, $rev_infodb_handle, $key, $val_hash);
    }
    &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $rev_infodb_handle);

}

sub save_info {
    my $self = shift (@_);
    my ($arcinfo_filename) = @_;

    
    if ($arcinfo_filename =~ m/(contents)|(\.inf)$/) {
	$self->_save_info_txt($arcinfo_filename);
    }
    else {
	$self->_save_info_db($arcinfo_filename);
    }
}



sub get_timestamp_filename
{
    my $self = shift (@_);
    my ($arcinfo_doc_filename) = @_;

    my $arcinfo_dirname = &File::Basename::dirname($arcinfo_doc_filename);
    
    my $timestamp_filename = &FileUtils::filenameConcatenate($arcinfo_dirname, $self->{'timestamp_file'});

    return $timestamp_filename;
}


sub load_timestamp
{
    my $self = shift (@_);
    my ($arcinfo_doc_filename) = @_;

    my $arcinfo_timestamp_filename = $self->get_timestamp_filename($arcinfo_doc_filename);
    
    my $timestamp_infodbtype = undef;
    my $arcinfo_timestamp    = undef;
    
    if (-f $arcinfo_timestamp_filename) {
	if (open(ARCINFTS_IN,"<$arcinfo_timestamp_filename")) {
	    my $line = <ARCINFTS_IN>;
	    chomp $line;
	    close(ARCINFTS_IN);

	    ($timestamp_infodbtype,$arcinfo_timestamp) = ($line =~ m/^\s*(\w+)\s*:\s*(\d+)\s*$/)
	}
	else {
	    print STDERR "Error - failed to read:\n";
	    print STDERR "    $arcinfo_timestamp_filename\n";
	}
    }
    
    return ($timestamp_infodbtype,$arcinfo_timestamp);
}


sub save_arcinfo_doc_timestamp
{
    my $self = shift (@_);
    my ($arcinfo_doc_filename) = @_;
    
    my $arcinfo_timestamp_filename = $self->get_timestamp_filename($arcinfo_doc_filename);

    my $infodbtype = $self->{'infodbtype'};

    # Before recording the current infodbtype type and timestamp
    # handle the case of prev_infodbtype being different to infodbyte
    # => mothball the prev_infodbtype as a backup file
    
    my $prev_infodbtype = $self->{'prev-infodbtype'};
    if ((defined $prev_infodbtype) && ($prev_infodbtype ne $infodbtype)) {
	my ($arcinfo_tailname, $arcinfo_dirname, $arcinfo_suffix)
	    = &File::Basename::fileparse($arcinfo_doc_filename, "\\.[^\\.]+\$");

	my $prev_arcinfo_filename   = &dbutil::get_infodb_file_path($prev_infodbtype,$arcinfo_tailname,$arcinfo_dirname);	
	my $backup_arcinfo_filename = $prev_arcinfo_filename;
	$backup_arcinfo_filename =~ s/\.$prev_infodbtype$/.bak.$prev_infodbtype/;

	print STDERR "As a result of infodbtype changing from $prev_infodbtype -> $infodbtype\n";	
	print STDERR "=> Making a backup copy of $prev_arcinfo_filename\n";
	&dbutil::rename_db_file_to($prev_infodbtype,$prev_arcinfo_filename,$backup_arcinfo_filename);

    }

    # Could consider taking the the timestamp of the file itself ???
    #   e.g. $arcinfo_timestamp = -M $arcinfo_filename;
    # But for now base this on a 'live' reading 
    my $arcinfo_timestamp = time(); 
    
    if (open(ARCINFTS_OUT,">$arcinfo_timestamp_filename")) {
	print ARCINFTS_OUT "$infodbtype:$arcinfo_timestamp\n";
	
	close(ARCINFTS_OUT);
    }
    else {
	print STDERR "Error - failed to write:\n";
	print STDERR "    $arcinfo_timestamp_filename\n";
    }
}


sub delete_info {
    my $self = shift (@_);
    my ($OID) = @_;

    if (defined $self->{'info'}->{$OID}) {
	delete $self->{'info'}->{$OID};
	
	my $i = 0;
	while ($i < scalar (@{$self->{'order'}})) {
	    if ($self->{'order'}->[$i]->[ORDER_OID_INDEX] eq $OID) {
		splice (@{$self->{'order'}}, $i, 1);
		last;
	    }
	    
	    $i ++;
	}
    }
}

sub add_info {
    my $self = shift (@_);
    my ($OID, $doc_file, $index_status, $sortmeta, $group_position) = @_;
    $sortmeta = "" unless defined $sortmeta;
    $index_status = "I" unless defined $index_status; # I = needs indexing
    if (! defined($OID)) {
	# only happens when no files can be processed?
	return undef;
    }

    if (defined $self->{'info'}->{$OID}) {
	# test to see if we are in a reindex situation

	my $existing_status_info = $self->get_status_info($OID);

	if ($existing_status_info eq "D") {
	    # yes, we're in a reindexing situation
	    $self->delete_info ($OID);


	    # force setting to "reindex"
	    $index_status = "R";

	}
	else {
	    # some other, possibly erroneous, situation has arisen
	    # where the document already seems to exist
	    print STDERR "Warning: $OID already exists with index status $existing_status_info\n";
	    print STDERR "         Deleting previous version stored in archiveinfo\n";

	    $self->delete_info ($OID);
	}
    }

    $self->{'info'}->{$OID} = [$doc_file,$index_status,$sortmeta, $group_position];
    push (@{$self->{'order'}}, [$OID, $sortmeta]); # ORDER_OID_INDEX and ORDER_SORT_INDEX


}

sub set_status_info {
    my $self = shift (@_);
    my ($OID, $index_status) = @_;

    my $OID_info = $self->{'info'}->{$OID};
    $OID_info->[INFO_STATUS_INDEX] = $index_status;
}


sub get_status_info {
    my $self = shift (@_);
    my ($OID) = @_;

    my $index_status = undef;

    my $OID_info = $self->{'info'}->{$OID};
    if (defined $OID_info) {
	$index_status = $OID_info->[INFO_STATUS_INDEX];
    }
    else {
	die "Unable to find document id $OID\n";
    }

    return $index_status;

}

sub get_group_position {
    my $self = shift (@_);
    my ($OID) = @_;

    my $group_position = undef;
    my $OID_info = $self->{'info'}->{$OID};
    if (defined $OID_info) {
	$group_position = $OID_info->[INFO_GROUPPOS_INDEX];
    }
    else {
	die "Unable to find document id $OID\n";
    }
    return $group_position;
	
}
sub add_reverseinfo {
    my $self = shift (@_);
    my ($key, $OID) = @_;

    my $existing_key = $self->{'reverse-info'}->{$key};
    if (!defined $existing_key) {
	$existing_key = {};
	$self->{'reverse-info'}->{$key} = $existing_key;
    }

    my $existing_oid = $existing_key->{'oid'};	
    if (!defined $existing_oid) {
	$existing_oid = [];
	$existing_key->{'oid'} = $existing_oid;
    }

    for (@$existing_oid) {
	if ($_ eq $OID) {
	    return; # already in the list
	}
    }
    push(@$existing_oid,$OID);
    
}

sub remove_reverseinfo {
     my $self = shift (@_);
     my ($key, $OID) = @_;

     my $existing_key = $self->{'reverse-info'}->{$key};
     if (!defined $existing_key) {
	 ###print STDERR "trying to remove entry for $key, but its not there!\n";
	 return;
     }
     if (!defined $OID) {
	 ###print STDERR "no oid defined, removing whole entry\n";
	 delete $self->{'reverse-info'}->{$key};
	 return;
     }
     my $existing_oid = $existing_key->{'oid'};
     if (!defined $existing_oid) {
	 ###print STDERR "trying to remove entry for $key, but it has no oid field!\n";
	 return;
     }
     for my $i (0..scalar(@$existing_oid)) {
	 if (@$existing_oid[$i] eq $OID) {
	     splice @$existing_oid, $i, 1;
	     if (scalar (@$existing_oid) ==0) {
		 ###print STDERRQ "have removed all oids, delete the main key\n";
		 delete $self->{'reverse-info'}->{$key};
	     }
	     return;
	 }
     }
}
sub get_reverseinfo {
    my $self = shift (@_);
    my ($key) = @_;

    if ($self->{'reverse-info'}->{$key}) {
	return $self->{'reverse-info'}->{$key}->{'oid'};
    }
    return undef;
}

sub set_meta_file_flag {
    my $self = shift (@_);
    my ($key) = @_;

    my $existing_key = $self->{'reverse-info'}->{$key};
    if (!defined $existing_key) {
	$existing_key = {};
	$self->{'reverse-info'}->{$key} = $existing_key;
    }

    $existing_key->{'meta-file'} = ["1"];

}
sub reverse_sort 
{
    my $self = shift(@_);
    $self->{'reverse_sort'} = 1;
}
sub sort 
{
    my $self = shift(@_);
    $self->{'sort'} = 1;
}


# returns a list of the form [[OID, doc_file, index_status], ...]
sub get_OID_list 
{
    my $self = shift (@_);

    my $order = $self->{'order'};

    my @sorted_order;
    if ($self->{'reverse_sort'}) {
	@sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
    } elsif ($self->{'sort'}) {
	@sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
    } else { # not sorting, don't bother
	@sorted_order = @$order;
    }

    my @list = ();

    foreach my $OID_order (@sorted_order) {
	my $OID = $OID_order->[ORDER_OID_INDEX];
	my $OID_info = $self->{'info'}->{$OID};

	push (@list, [$OID, $OID_info->[INFO_FILE_INDEX], 
		      $OID_info->[INFO_STATUS_INDEX]]);
    }

    return \@list;
}

# returns a list of the form [[doc_file, OID], ...]
sub get_file_list {
    my $self = shift (@_);

    my $order = $self->{'order'};

    my @sorted_order;
    if ($self->{'reverse_sort'}) {
	@sorted_order = sort {$b->[ORDER_SORT_INDEX] cmp $a->[ORDER_SORT_INDEX]} @$order;
    } elsif ($self->{'sort'}) {
	@sorted_order = sort {$a->[ORDER_SORT_INDEX] cmp $b->[ORDER_SORT_INDEX]} @$order;
    } else { # not sorting, don't bother
	@sorted_order = @$order;
    }

    my @list = ();

    foreach my $OID_order (@sorted_order) {
	my $OID = $OID_order->[ORDER_OID_INDEX];
	my $OID_info = $self->{'info'}->{$OID};

	push (@list, [$OID_info->[INFO_FILE_INDEX], $OID]);
    }

    return \@list;
}


# returns a list of the form [doc_file,index_status,$sort_meta, $group_position]
sub get_info {
    my $self = shift (@_);
    my ($OID) = @_;

    if (defined $self->{'info'}->{$OID}) {
	return $self->{'info'}->{$OID};
    }

    return undef;
}



# returns the number of documents so far
sub size {
    my $self = shift (@_);
    return (scalar(@{$self->{'order'}}));
}

1;

