###########################################################################
#
# solrbuilder.pm -- perl wrapper for building index with Solr
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


package solrbuilder;

use strict; 
no strict 'refs';

use lucenebuilder;
use solrserver;

sub BEGIN {
    @solrbuilder::ISA = ('lucenebuilder');
}


sub new {
    my $class = shift(@_);
    my $self = new lucenebuilder (@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "solr";

    my $solr_passes_script = "solr_passes.pl";

    $self->{'solr_passes'} = "$solr_passes_script";
    # Tack perl on the beginning to ensure execution
    $self->{'solr_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$solr_passes_script\"";
    return $self;
}


sub default_buildproc {
    my $self  = shift (@_);

    return "solrbuildproc";
}


# This writes a nice version of the text docs
#
# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance
#
sub compress_text
{
    my $self = shift (@_);
    # do nothing if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;

    # workaround to avoid hard-coding "solr" check into buildcol.pl
    $textindex =~ s/^section://; 

    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
	$text_dir =~ s@/@\\@g;
    }
    else
    {
	if ($outhandle ne "STDERR")
	{
	    # so solr_passes doesn't print to stderr if we redirect output
	    $osextra .= " 2>/dev/null";
	}
    }

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    my $solr_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
	$handle = *STDOUT;
    }
    else
    {
	my $site        = $self->{'site'};
	my $collection     = $self->{'collection'};	
	my $core_prefix = (defined $site) ? "$site-$collection" : $collection;
	my $core        = $core_prefix; # unused in this call to solr_passes

	$core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

        print STDERR "Executable:    $solr_passes_exe\n";
        print STDERR "Sections:      $solr_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $solr_passes_exe $core text \"$build_dir\"  $osextra\n";
	if (!open($handle, "| $solr_passes_exe $core text \"$build_dir\"   $osextra"))
	{
	    print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
	    die "solrbuilder::build_index - couldn't run $solr_passes_exe\n$!\n";
	}
    }

    # stored text is always Doc and Sec levels    
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section"; 

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
		   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}

#----



sub filter_in_out_file
{
    my ($in_filename,$out_filename,$replace_rules) = @_;

    if (open(SIN,"<$in_filename")) {

	if (open(SOUT,">$out_filename")) {

	    my $line;
	    while (defined ($line=<SIN>)) {
		chomp $line;

		my $done_insert = 0;
		foreach my $rule (@$replace_rules) {
		    my $line_re = $rule->{'regexp'};
		    my $insert  = $rule->{'insert'};

		    if ($line =~ m/$line_re/) {
			print SOUT $insert;
			$done_insert = 1;
			last;
		    }
		}
		if (!$done_insert) {
		    print SOUT "$line\n";
		}
	    }

	    close(SOUT);
	}
	else {
	    print STDERR "Error: Failed to open $out_filename\n";
	    print STDERR "       $!\n";
	}

	close(SIN);
    }
    else {
	print STDERR "Error: Failed to open $in_filename\n";
	print STDERR "       $!\n";
    }

}
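
# Illustrative usage sketch (marker and filenames hypothetical), mirroring the
# schema.xml filtering done in premake_solr_auxiliary_files() below: any input
# line matching a rule's 'regexp' is replaced by that rule's 'insert' text
# (which must supply its own trailing newline); all other lines are copied
# through unchanged, and an empty rule list makes this a plain file copy.
#
#   my $rules = [ { 'regexp' => "^\\s*<!--\\s*##MY-MARKER##\\s*-->\\s*\$",
#                   'insert' => "  <field name=\"XX\" type=\"string\" />\n" } ];
#   &filter_in_out_file("schema.xml.in", "schema.xml", $rules);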

# We need to push the list of indexfield-to-shortname mappings through to the
# build_cfg as, unlike in MGPP, we need these mappings in advance to configure
# Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes
# a mess of this - it only outputs fields that have been processed (none have)
# and it has a hardcoded renaming for 'text', so it becomes 'TX' according to
# the schema but 'TE' according to the XML sent to lucene_passes.pl/solr_passes.pl.
# This version is dumber - it just copies them all across verbatim - but works.
# We do still need to support the special case of 'allfields'.
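#
# For example (non-ZZ shortnames illustrative), an 'indexes' entry such as
# "allfields;Title:subcoll" has its subcollection suffix stripped and is then
# split on ';', yielding indexfields ('allfields', 'Title') and indexfieldmap
# entries ('allfields->ZZ', 'Title->TI'), where each non-allfields shortname
# is taken verbatim from the buildproc's indexfieldmap.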
sub make_final_field_list
{
  my $self = shift (@_);
  $self->{'build_cfg'} = {};
  my @indexfieldmap = ();
  my @indexfields = ();

  # so we don't add duplicates - now that we have subcollections working,
  # there will be multiple index defs with the same fields (and different
  # subcolls)
  my $done_fields = {};
  # @todo support: $self->{'buildproc'}->{'extraindexfields'}
  foreach my $origfields (@{$self->{'collect_cfg'}->{'indexes'}})
  {
    # remove subcoll stuff for finding fields, but we need to leave it in the
    # original index definition for later, so make a copy
    my $fields = $origfields;
    $fields =~ s/:.*$//;
    foreach my $field (split(';', $fields))
    {
      next if (defined $done_fields->{$field});
      my $shortname = 'ERROR';
      if ($field eq 'allfields')
      {
        $shortname = 'ZZ';
      }
      elsif (defined $self->{'buildproc'}->{'indexfieldmap'}->{$field})
      {
        $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
      }
      else
      {
        print STDERR 'Error! Couldn\'t find indexfieldmap for field: ' . $field . "\n";
      }
      push (@indexfieldmap, $field . '->' . $shortname);
      push (@indexfields, $field);
      $done_fields->{$field} = 1;
    }
  }

  if (scalar @indexfieldmap)
  {
    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
  }

  if (scalar @indexfields)
  {
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
  }
}


# Generate solr schema.xml file based on indexfieldmap and other associated
# config files 
#
# Unlike make_auxiliary_files(), this needs to be done up-front (rather
# than at the end) so the data-types in schema.xml are correctly set up
# prior to document content being pumped through solr_passes.pl


sub premake_solr_auxiliary_files 
{
    my $self = shift (@_);
 
    # Replace the following marker: 
    #
    #   <!-- ##GREENSTONE-FIELDS## -->
    #
    # with lines of the form:
    #
    #   <field name="<field>" type="string" ... /> 
    #
    # for each <field> in 'indexfieldmap'

    my $schema_insert_xml = "";

    foreach my $ifm (@{$self->{'build_cfg'}->{'indexfieldmap'}}) {
        my ($fullfieldname, $field) = ($ifm =~ m/^(.*)->(.*)$/);
        
        $schema_insert_xml .= "    "; # indent
        $schema_insert_xml .= "<field name=\"$field\" ";
        
        if($field eq "CD" || $field eq "CS") {
            # Coordinate and CoordShort meta should not be split but treated as a whole string for searching. So type=string, not type=text_en_splitting.
            # Can't set to type="location", which uses solr.LatLonType, since type=location fields "must not be multivalued" as per conf/schema.xml.in.
            # And we can have multiple Coordinate (and multiple CoordShort) meta for one doc, so multiValued=true.
            # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means "you only need to search on the field but
            # don't need to return the original value", and they advise setting stored="false" for all fields possible (esp. large fields).
            # But stored=false makes the field not visible in Luke, so setting stored=true as for other fields.
            # TermVector: "A term vector is a list of the document's terms and their number of occurrences in that document.
            # Each document has one term vector which is a list." (http://makble.com/what-is-term-vector-in-lucene and the lucene API for Field.TermVector)
            # e.g. docA contains "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate meta as a term: it's not a "term" occurring
            # in the doc, and we don't care how often a Coordinate occurs in a document.
            # Consequently, we don't care about term positions and term offsets for Coordinate meta either.
            
            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }
		
        elsif($field eq "ML") { 
            # mapLabel: same attributes as for coord meta CD and CS above
            # mapLabel is also like facets with type="string" to not get tokenized, and multiValued="true" to allow each shape's label to be stored distinctly
            $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n";
        }
        
        else {
            if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN)
            {
                # Latitude and Longitude are being phased out in favour of using Coord meta.
                # However, if ever returning to using Lat and Lng instead of Coord meta, note that the way the Lat Lng meta
                # is currently written out for type="location" is in the wrong format: Lat and Lng shouldn't get written out
                # separately, but together as: Lat,Lng
                # It gets written out in solrbuildproc.pm, I think, so that would be where it needs to be corrected.
                # For more info on type=location for our solr 4.7.2 or thereabouts, see https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev
                # which states:
                #    When indexing, the format is something like:
                #       <field name="store_lat_lon">12.34,-123.45</field>
                #
                $schema_insert_xml .=   "type=\"location\" ";				
            }
            
            
            #		elsif ($field ne "ZZ" && $field ne "TX")
            #		{
            #			$schema_insert_xml .=   "type=\"string\" ";
            #		}
            else
            {
                #$schema_insert_xml .= "type=\"text_en_splitting\" ";
                
                # original default solr field type for all fields is text_en_splitting
                my $solrfieldtype = "text_en_splitting";
                if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) {	
                    $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'};
                    #print STDERR "@@@@#### found TYPE: $solrfieldtype\n";
                }
                $schema_insert_xml .= "type=\"$solrfieldtype\" ";
                
            }
            # set termVectors=\"true\" when term vectors info is required, 
            # see TermsResponse termResponse = solrResponse.getTermsResponse(); 
            $schema_insert_xml .=  "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";
        }
    }
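
    # So for a regular (non-coordinate) field, the generated line looks
    # something like this (shortname and type illustrative):
    #
    #   <field name="TI" type="text_en_splitting" indexed="true" stored="true"
    #          multiValued="true" termVectors="true" termPositions="true"
    #          termOffsets="true" />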

    # just the one rule to date
    my $insert_rules 
	= [ { 'regexp' => "^\\s*<!--\\s*##GREENSTONE-FIELDS##\\s*-->\\s*\$",
	      'insert' => $schema_insert_xml } ];

    my $site = $self->{'site'};
    my $collection = $self->{'collection'};
    my $core_names = $self->{'solrfullcorenames'};
    
    my $solr_src_home = $ENV{'GEXT_SOLR9'};
##    my $in_dirname = &FileUtils::filenameConcatenate($solr_src_home,"etc","conf");
    my $in_dirname = &FileUtils::filenameConcatenate($solr_src_home,"gs3-default-conf");
    my $schema_in_filename = &FileUtils::filenameConcatenate($in_dirname,"schema.xml.in");

    my @in_file_list = ( "solrconfig.xml", "synonyms.txt", "protwords.txt", "stopwords.txt");
    my @in_dir_list = ( "lang" );
    
    #my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    #my $out_dirname = &FileUtils::filenameConcatenate($collect_home,"etc","conf");
    my $solr_data_home = $ENV{'SOLR_HOME'};
    my $collection_out_dirname = &FileUtils::filenameConcatenate($solr_data_home, "cores", $site, $collection);

    foreach my $core (@$core_names) {
        my $out_dirname = &FileUtils::filenameConcatenate($collection_out_dirname, $core, "conf");
        print STDERR "out dir for core $core = $out_dirname\n";
        &FileUtils::makeAllDirectories($out_dirname);
        
        my $schema_out_filename = &FileUtils::filenameConcatenate($out_dirname,"schema.xml");
    

        &filter_in_out_file($schema_in_filename,$schema_out_filename,$insert_rules);

        # now do the same for solrconfig.xml, stopwords, ...
        # these are simpler, as they currently do not need any filtering
        
        
        foreach my $file ( @in_file_list ) {
            my $in_filename = &FileUtils::filenameConcatenate($in_dirname,$file);
            my $out_filename = &FileUtils::filenameConcatenate($out_dirname,$file);
            
            if(&FileUtils::fileExists($in_filename)) {
                &filter_in_out_file($in_filename,$out_filename,[]);
            }
        }
        
        foreach my $dir ( @in_dir_list ) {
            
            my $full_subdir_name = &FileUtils::filenameConcatenate($in_dirname,$dir);
            
            if(&FileUtils::directoryExists($full_subdir_name)) {
                &FileUtils::copyFilesRecursiveNoSVN($full_subdir_name, $out_dirname);
            }
        }

        # this was in pre_build_indexes. I don't think we need it anymore, but if we did, it should go here.
        #my $full_index_dir = &FileUtils::filenameConcatenate($build_dir,$index_dir);
        #&FileUtils::removeFilesRecursive($full_index_dir);
        #&FileUtils::makeDirectory($full_index_dir);
        
        #my $full_tlog_dir = &FileUtils::filenameConcatenate($full_index_dir, "tlog");
        #&FileUtils::makeDirectory($full_tlog_dir);

    } # foreach $core
}


sub pre_build_indexes
{
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # If the Solr server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"

    # build_dir is only passed in for the 'text' phase
    my $solr_server = new solrserver();
    $solr_server->start();
    $self->{'solr_server'} = $solr_server;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
	push @$indexes, $indexname;
    } else {
	$indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions 
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);


    # skip para-level check, as this is done in the main 'build_indexes' 
    # routine

    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    my $allfields_index = 0;        # do we have an allfields index?

    # Using a hashmap here would avoid duplications, but while more space
    # efficient, it's not entirely clear it would be more computationally
    # efficient
    my @all_fields = ();

    foreach my $index (@$indexes) {
	if ($self->want_built($index)) {

	    # get the parameters for the output
	    # split on : just in case there is subcoll and lang stuff
	    my ($fields) = split (/:/, $index);

	    foreach my $field (split (/;/, $fields)) {
		if ($field eq "metadata") {
		    $all_metadata_specified = 1;
		}
		else {
		    push(@all_fields,$field);
		}
	    }
	}
    }

    if ($all_metadata_specified) {

	# (Unfortunately) we need to process all the documents in the collection
	# to figure out what the metadata_field_mapping is	    

	# set up the document processor
	$self->{'buildproc'}->set_output_handle (undef);
	$self->{'buildproc'}->set_mode ('index_field_mapping');
	$self->{'buildproc'}->reset();
	
	&plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
		       $self->{'buildproc'}, $self->{'maxdocs'});
	&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		       "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
	&plugin::end($self->{'pluginfo'});
	
    }

    else {
	# Field mapping is solely dependent on entries in 'indexes'

	# No need to explicitly handle "allfields" as create_shortname()
	# will get a fix on it through its static_indexfield_map

	my $buildproc = $self->{'buildproc'};
	
	foreach my $field (@all_fields)
	{
	    if (!defined $buildproc->{'indexfieldmap'}->{$field})
	    {
		my $shortname = $buildproc->get_or_create_shortname($field);
		$buildproc->{'indexfieldmap'}->{$field} = $shortname;
		$buildproc->{'indexfieldmap'}->{$shortname} = 1;
	    }
	}
    }

    # Work out the final field list, needed to generate the solr 'schema.xml'
    # (and related) files below
    $self->make_final_field_list();


    # Now update the solr-core information in solr.xml
    # => at most two cores <colname>-Doc and <colname>-Sec

    my $site        = $self->{'site'};
    my $collection     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collection" : $collection;
    my $build_dir = $self->{'build_dir'};

    $self->{'solrcores'} = [];
    # force_removeold == opposite of being run in 'incremental' mode
    my $force_removeold = ($self->{'incremental'}) ? 0 : 1;

    # generate core names
    foreach my $index (@$indexes) {
	next unless ($self->want_built($index));

        my $idx = $self->{'index_mapping'}->{$index};
	    
        foreach my $level (keys %{$self->{'levels'}}) {
            
            my ($pindex) = $level =~ /^(.)/;
            
            my $index_dir = $pindex.$idx;
            my $core = "$core_prefix-$index_dir";
            # solrcores has didx, sidx
            push (@{$self->{'solrcores'}}, $index_dir);
            
            if ($force_removeold) {
                print $outhandle "\n-removeold set (new index will be created)\n";
                
                # create cores under temporary core names, corresponding to the building directory
                # what if we have set building to index??
                $core = "building-".$core; 
                
            }
            push (@{$self->{'solrfullcorenames'}}, $core);   
        }
    }
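
    # At this point, for example (site/collection/index names illustrative),
    # site "localsite" and collection "demo" with document- and section-level
    # indexes mapped to "didx" and "sidx" give solrcores ("didx", "sidx") and
    # solrfullcorenames ("localsite-demo-didx", "localsite-demo-sidx"), each
    # full name prefixed with "building-" when -removeold is in effect.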
    $self->premake_solr_auxiliary_files();
    
    foreach my $core (@{$self->{'solrfullcorenames'}}) {
        if ($force_removeold) {
            
            
            # now go on and create new index
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core, $site, $collection );
            
        }
        else {
            # if collect==core is already in solr.xml (check with STATUS)
            # => use RELOAD* call to refresh fields now expressed in schema.xml
            # 
            # else 
            # => use CREATE API to add to solr.xml
            #
            # No longer calling RELOAD, because Georgy documented a memory leak with it (svn r32178)
            # Using unload + create to get the same effect as RELOAD without its side-effects.
            # 
            #TODO check if this is still true? can we use reload? what exactly are we doing here??
            my $check_core_exists = $solr_server->admin_ping_core($core);
            
            if ($check_core_exists) {	    
                print $outhandle "Unloading Solr core: $core\n";
                $solr_server->admin_unload_core($core);
            }
            
            print $outhandle "Creating Solr core: $core\n";
            $solr_server->admin_create_core($core, $site, $collection);
        }
    } #foreach core
            
}

# Essentially the same as the lucenebuilder.pm version, only using solr_passes
# => refactor and make better use of inheritance

sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories(&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run solr
    my $solr_passes = $self->{'solr_passes'};
    my $solr_passes_exe = $self->{'solr_passes_exe'};

    # define the section names (and possibly the doc name) for solr_passes
    my $solr_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	$build_dir =~ s@/@\\@g;
    } else {
	if ($outhandle ne "STDERR") {
	    # so solr_passes doesn't print to stderr if we redirect output
	    $osextra .= " 2>/dev/null";
	}
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
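    # e.g. (illustrative spec) an index of "Title;Source:sc1,sc2:en,!fr" splits
    # into fields "Title;Source", subcollections ("sc1", "sc2") and languages
    # ("en", "!fr"), where a leading '!' negates the language match below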
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
	if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
	    push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
	}
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
	$languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
	my $not=0;
	if ($language =~ s/^\!//) {
	    $not = 1;
	}
	if($not) {
	    push (@$langarr, "!$language");
	} else {
	    push (@$langarr, "$language");
	}
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (solr_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
	$handle = *STDOUT;
    } else {
	my $site        = $self->{'site'};
	my $collection     = $self->{'collection'};
	my $core_prefix = (defined $site) ? "$site-$collection" : $collection;
	my $ds_idx      = $self->{'index_mapping'}->{$index};
	my $core        = "$core_prefix-$ds_idx";

	$core = "building-".$core unless $self->{'incremental'}; # core points to building only for force_removeold

	print STDERR "Cmd: $solr_passes_exe $core index \"$build_dir\"   $osextra\n";
	if (!open($handle, "| $solr_passes_exe $core index \"$build_dir\"   $osextra")) {
	    print STDERR "<FatalError name='NoRunSolrPasses'/>\n</Stage>\n" if $self->{'gli'};
	    die "solrbuilder::build_index - couldn't run $solr_passes_exe\n!$\n";
	}
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
	if ($mgppbuilder::level_map{$key} eq $llevel) {
	    $dom_level = $key;
	}
    }
    if ($dom_level eq "") {
	print STDERR "Warning: unrecognized tag level $llevel\n";
	$dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
	$self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }
    if (defined $self->{'collect_cfg'}->{'facetfields'}) {
	$self->{'buildproc'}->set_facetfields ($self->{'collect_cfg'}->{'facetfields'});
    }
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    print $handle "<update>\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
	
    print $handle "</update>\n";

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
    

    ##print STDERR "@@@@@ FINISHED PROCESSING INDEX: indexlevel $self->{'index_mapping'}->{$index}\n\n";

}


sub post_build_indexes {
    my $self = shift(@_);

    # deliberately override to prevent the mgpp post_build_index() calling
    #  $self->make_final_field_list()
    # as this has been done in our pre_build_indexes() phase for solr

      
    my $solr_server = $self->{'solr_server'};

    # 1 Aug 2019: we now unload (remove) building-cores for each index during buildcol itself,
    # instead of during activate.
    # Kathy described a problem whereby, when calling buildcol.pl successively without following
    # each call with a call to activate, there were Windows file lock issues when attempting to
    # manually remove the building folder. This was what activate was meant to solve; however,
    # there's no reason to call activate after buildcol in cases where it is known the buildcol
    # failed in some way.
    # (In such cases, the user building the collection would have to manually unload the building-
    # cores through the solr servlet interface.)
    # Dr Bainbridge instructed that the building- cores should be unloaded again at the end
    # of buildcol, along with any symmetrical step during pre-building, if any is found necessary.
    # I'm still not sure this won't break activate in some way, for some combination,
    # as that is meant to ensure building-cores exist whenever the building folder exists...
    # But I was asked not to take too long on this, so I can't test all the different combinations
    # (removeold/incremental/..., or even the remote GS situation) in which building can happen
    # and in which buildcol can be combined or not with activate, or be sequenced with further
    # calls to buildcol, with or without -activate.
    # So to compensate, I've tried to keep the code changes as conservative as possible, to keep
    # the chances of things going wrong to a minimum: we ping for building-* cores before unloading
    # them here in solrbuilder.pm (note that unload doesn't delete the index directory associated
    # with the core), and then in activate.pl the building-* cores get pinged again to determine
    # whether they exist before attempting to unload them there as well, since we can no longer
    # assume the cores exist and can be unloaded. There is now the additional overhead of all the
    # extra pinging going on, but it helps ensure we only unload building-* cores when they exist.

    # Note that pre_build_indexes() was already creating the building- cores, so we don't need to
    # add anything at the start of buildcol to be symmetrical with unloading them here.

    # Update: now that we can do subcollections with solr, we don't know what cores may have been
    # there, so let's just remove all building cores for the collection.
    my $site        = $self->{'site'};
    my $collection     = $self->{'collection'};
    my $core_prefix = (defined $site) ? "$site-$collection" : $collection;    
    my $build_dir = $self->{'build_dir'};
    
    $solr_server->admin_unload_all_cores_for_prefix("building-$core_prefix");

    # Also need to stop the Solr server (be it tomcat or jetty) if it was explicitly started
    # in pre_build_indexes()

    if ($solr_server->explicitly_started()) {
	$solr_server->stop();
    }

    $self->{'solr_server'} = undef;

}    

sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->lucenebuilder::build_cfg_extra($build_cfg);

    # need to add in facet stuff
    my @facetfields = ();
    my @facetfieldmap = ();

    foreach my $sf (@{$self->{'buildproc'}->{'facetfields'}}) {
	if ($self->{'buildproc'}->{'actualfacetfields'}->{$sf}) {
	    my $shortname = $self->{'buildproc'}->{'facetfieldnamemap'}->{$sf};
	    push(@facetfields, $shortname);
	    push (@facetfieldmap, "$sf\-\>$shortname");
	}
	
    }
    $build_cfg->{'indexfacetfields'} = \@facetfields;
    $build_cfg->{'indexfacetfieldmap'} = \@facetfieldmap;
    # store the core names in buildConfig, so that activate.pl can use them,
    $build_cfg->{'solrcores'} = $self->{'solrcores'};

}

#####################################
#The following functions are deliberately not object member functions.
#We call them from activate.pl and don't want to instantiate the builder + buildproc etc

sub get_collection_core_directory {
    my ($site, $collection) = @_;
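    # e.g. (site/collection names illustrative) site "localsite" and
    # collection "demo" give $GSDL3DATAHOME/ext/solr9/cores/localsite/demo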

    return &FileUtils::filenameConcatenate($ENV{'GSDL3DATAHOME'}, "ext", "solr9", "cores", $site, $collection);
}

# need incremental arg??
sub pre_activate {
    my ($buildcfg, $site, $collection) = @_;
    print STDERR "*****in solr preactiavte\n";


    my $core_basename = "$site-$collection";
	
    # If the Solr server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"	
    my $solr_server = new solrserver();
    my $server_running = $solr_server->server_running();
    #    $solr_server->start();
    
#    $gsserver->print_task_msg("unloading all solr cores associated with $core_basename & building-$core_basename");

    if ($server_running) {
        $solr_server->admin_unload_all_cores_for_prefix($core_basename);
        $solr_server->admin_unload_all_cores_for_prefix("building-$core_basename");
    }
#    if ($solr_server->explicitly_started()) {
#	$solr_server->stop();
#    }

    my @corenames = @{$buildcfg->{'solrcores'}};

    my %valid_cores;
    foreach my $corename (@corenames) {
        $valid_cores{"building-$core_basename-$corename"} = 1;
        print STDERR "adding to valid building-$core_basename-$corename\n";
    }
    # look through the data folder, and rename any folder with building- in front
    my $data_dir = &get_collection_core_directory($site, $collection);
    # only want directories
    my $options = { 'strict' => 1, 'exclude_files' => 1 };
    my ($ret_val, $contents) = &FileUtils::_readdirWithOptions($data_dir, $options);
    if (scalar(@$contents) == 0) {
        print STDERR "no cores found in $data_dir\n";
        return;
    }

    foreach my $dir (@$contents) {
        print STDERR "found dir $dir\n";
        if (! ($valid_cores{$dir} == 1)) {
            my $full_dir = &FileUtils::filenameConcatenate($data_dir, $dir);
            print STDERR "delete dir $full_dir\n";
            
            &FileUtils::removeFilesRecursive($full_dir);
            if (-e $full_dir) {
                print STDERR "Couldn't remove all of directory $full_dir\n";
                # should this stop the build??
            }
            #if (&FileUtils::isDirectoryEmpty($full_dir)) {
            #    rmdir($full_dir);
            #}
        }
    }
    foreach my $corename (@corenames) {
        my $new_dir = &FileUtils::filenameConcatenate($data_dir, "$core_basename-$corename");
        my $old_dir = &FileUtils::filenameConcatenate($data_dir, "building-$core_basename-$corename");
        print STDERR "renaming $old_dir => $new_dir\n";
        &FileUtils::renameDirectory($old_dir, $new_dir);
        if (!$server_running) {
            &generate_core_properties($new_dir, "$core_basename-$corename");
        }
    }
}

sub generate_core_properties {
    my ($dir, $corename) = @_;
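
    # Writes out a minimal core.properties file, which Solr uses for core
    # auto-discovery on startup - needed when the server wasn't running for
    # pre_activate() to register the renamed core via the admin API.
    # e.g. for a corename of "localsite-demo-didx" (illustrative), the file
    # contains the single line:
    #   name=localsite-demo-didx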

    my $text = "name=$corename\n";
    my $prop_file = &FileUtils::filenameConcatenate($dir, "core.properties");

    &FileUtils::writeUTF8File($prop_file, \$text);
}

# At some stage we need to delete cores that don't match building-corename,
# then rename all those building ones
sub post_activate {
    my ($buildcfg, $site, $collection) = @_;

    # If the Solr server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"	
    my $solr_server = new solrserver();
   # $solr_server->start();
    if ($solr_server->server_running()) {
        # Call CREATE action to get the old cores pointing to the index folder
        #  -- any building or index cores have been unloaded already
        #  -- load up the new one
        my $core_basename = "$site-$collection";
        my @corenames = @{$buildcfg->{'solrcores'}};
        
        foreach my $corename (@corenames) {
            print STDERR "activating core $core_basename-$corename\n";
            # should be corename, site collection
            $solr_server->admin_create_core("$core_basename-$corename", $site, $collection);
        }
    }


#    if ($solr_server->explicitly_started()) {
#	$solr_server->stop();
#    }

}

# this is called if a collection's indexes were solr originally, but have now
# changed to something else

sub cleanup_after_buildtype_change {
    my ($site, $collection, $index_dir) = @_;
    my $solr_server = new solrserver();
    $solr_server->start();

    
    print STDERR "in solr cleanup\n";
    print STDERR "unload cores\n";
    $solr_server->admin_unload_all_cores_for_prefix("building-$site-$collection");
    $solr_server->admin_unload_all_cores_for_prefix("$site-$collection");

    if ($solr_server->explicitly_started()) {
	$solr_server->stop();
    }
    print STDERR "delete indexes\n";
    my $data_dir = &get_collection_core_directory($site, $collection);

    if (&FileUtils::directoryExists($data_dir)) {
        # delete this directory
        print STDERR "deleting files in $data_dir\n";
        &FileUtils::removeFilesRecursive($data_dir);
    }
}
    
1;


