#!/usr/bin/perl -w

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use warnings;

use Encode;
use JSON;

# use LWP;

use OAuth::Lite::Consumer;
use OAuth::Lite::AuthMethod;

use URI::Escape;

# Send a signed GET request to the HathiTrust Data API.
#
# The request URL is built as .../cgi/htd/$mode/$htid[/$opt_seq] and is
# issued through OAuth::Lite::Consumer using the URL_QUERY auth method.
#
# Parameters:
#   $mode       - API resource placed in the URL path; callers in this file
#                 pass "pageimage", "pageocr" and "structure"
#   $htid       - volume identifier placed in the URL path
#   $opt_seq    - optional; appended as a further path element when defined
#   $opt_params - optional hash ref of extra request parameters
#
# Returns the response object when the request succeeded.  On failure the
# request URL, the status line and the tag-stripped response body are
# logged to STDERR and undef is returned.
sub _data_api
{
    my ($mode,$htid,$opt_seq,$opt_params) = @_;

    # NOTE(review): credentials are embedded in the source; consider loading
    # them from the environment or from a config file kept out of version
    # control.
    my $access_key = '7e6ee38bae';
    my $secret_key = 'e0429c0394385486249b4a230702';

    my $request_url = "http://babel.hathitrust.org/cgi/htd/$mode/$htid";
    $request_url .= "/$opt_seq" if (defined $opt_seq);

    my $consumer = OAuth::Lite::Consumer->new( 'consumer_key' => $access_key,
                                               'consumer_secret' => $secret_key,
                                               'auth_method' => OAuth::Lite::AuthMethod::URL_QUERY );

    my $response = $consumer->request( 'method' => 'GET',
                                       'url' => $request_url,
                                       'params' => $opt_params );

    return $response if $response->is_success();

    # All diagnostics (including the separator lines) go to STDERR so that
    # they stay together and STDOUT is not polluted.
    print STDERR "**** Failed to retrieve any content from URL:\n";
    print STDERR "         ", $consumer->oauth_request->uri, "\n";
    print STDERR "------\n";
    print STDERR "**** Status:  ", $response->status_line, "\n";
    print STDERR "------\n";

    # Error bodies are typically HTML: strip the tags and blank out
    # whitespace-only lines so the log stays readable.
    my $text_only_content = $response->content();
    $text_only_content =~ s/<[^>]*>//g;
    $text_only_content =~ s/^\s*$//mg;

    print STDERR "**** Content: $text_only_content\n";
    print STDERR "------\n";

    return;
}


# Download one page image through the Data API and store it in $ofilename.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - path of the image file to create
#
# If $ofilename already exists the download is skipped.  A failed request is
# retried once after a 60 second pause; if the retry fails as well the whole
# script terminates via exit -1.  No meaningful value is returned.
sub pageimage_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    if (-f $ofilename) {
        print STDERR "Skipping PageImage data API request\n";
        print STDERR "=> downloaded file $ofilename already exists\n";
        return;
    }

    print STDERR "Downloading PageImage $htid/$seq_num\n";

    my $max_attempts = 2;
    my $response = undef;

    foreach my $attempt (1 .. $max_attempts) {
        $response = _data_api("pageimage",$htid,$seq_num);
        last if (defined $response);

        print STDERR "Failed to download PageImage\n";

        if ($attempt < $max_attempts) {
            print STDERR "Sleeping for 60 seconds\n";
            sleep(60);
            print STDERR "Retry attempt $attempt\n";
        }
        else {
            print STDERR "Maximum number of attempts reached.  Stopping.\n";
            exit -1;
        }
    }

    my $content = $response->content();

    my $img_out;
    if (!open($img_out, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for binary output\n";
        print STDERR "       $!\n";
        return;
    }
    binmode($img_out);

    my $write_ok  = (print {$img_out} $content) ? 1 : 0;
    my $write_err = $!;
    if (!close($img_out)) {
        $write_err = $! if ($write_ok);
        $write_ok  = 0;
    }

    if (!$write_ok) {
        print STDERR "Error: Failed to write $ofilename\n";
        print STDERR "       $write_err\n";
        # The existence of the file is what marks a page as already
        # downloaded, so a truncated file must not be left behind.
        unlink($ofilename);
    }

    return;
}
    


# Fetch the OCR text of one page, using $ofilename as an on-disk cache.
#
# Parameters:
#   $htid      - volume identifier
#   $seq_num   - page sequence number within the volume
#   $ofilename - optional path of the cache file; when undefined the text is
#                always downloaded and nothing is written to disk
#
# Returns the page text, or undef if a cached file could not be read (or was
# empty).  A failed request is retried once after a 60 second pause; if the
# retry fails as well the whole script terminates via exit -1.
#
# NOTE(review): a fresh download returns the response content unchanged
# (presumably raw octets), while the cached path reads the file through a
# :utf8 layer and so returns decoded characters.  Callers that use the return
# value should confirm which form they expect.
sub pageocr_data_api
{
    my ($htid,$seq_num,$ofilename) = @_;

    my $content = undef;
    my $have_cache_file = (defined $ofilename);

    if ($have_cache_file && -f $ofilename) {
        print STDERR "Skipping PageOCR Data API request\n";
        print STDERR "=> Using cached version of file:\n    $ofilename\n";

        my $txt_in;
        if (open($txt_in, '<', $ofilename)) {
            binmode($txt_in,":utf8");

            while (defined (my $line = <$txt_in>)) {
                $content .= $line;
            }
            close($txt_in);
        }
        else {
            print STDERR "Error: Failed to open cached file $ofilename for input\n";
            print STDERR "       $!\n";
        }

        return $content;
    }

    print STDERR "Downloading PageOCR (text) $htid/$seq_num\n";

    my $max_attempts = 2;
    my $response = undef;

    foreach my $attempt (1 .. $max_attempts) {
        $response = _data_api("pageocr",$htid,$seq_num);
        last if (defined $response);

        print STDERR "Failed to download PageOCR\n";

        if ($attempt < $max_attempts) {
            print STDERR "Sleeping for 60 seconds\n";
            sleep(60);
            print STDERR "Retry attempt $attempt\n";
        }
        else {
            print STDERR "Maximum number of attempts reached.  Stopping.\n";
            exit -1;
        }
    }

    $content = $response->content();

    # Without a cache filename there is nowhere to store the text.
    return $content if (!$have_cache_file);

    my $txt_out;
    if (!open($txt_out, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR "       $!\n";
        return $content;
    }

    my $write_ok  = (print {$txt_out} $content) ? 1 : 0;
    my $write_err = $!;
    if (!close($txt_out)) {
        $write_err = $! if ($write_ok);
        $write_ok  = 0;
    }

    if (!$write_ok) {
        print STDERR "Error: Failed to write $ofilename\n";
        print STDERR "       $write_err\n";
        # A truncated file would be mistaken for a valid cache entry on the
        # next run, so remove it.
        unlink($ofilename);
    }

    return $content;
}
    
# Obtain the METS structure record of a volume as a decoded JSON structure,
# using $ofilename as an on-disk cache.
#
# Parameters:
#   $htid      - volume identifier
#   $ofilename - path of the cached JSON structure file
#
# If $ofilename does not exist the record is requested from the Data API
# ("structure" resource with alt=json) and saved; otherwise the cached copy
# is read.  Returns the data structure produced by decode_json.
#
# Dies if the download fails, if no content could be obtained, or if the
# content is not valid JSON.
sub json_structure_data_api
{
    my ($htid,$ofilename) = @_;

    my $json_content = "";

    if (!-f $ofilename) {
        print STDERR "Downloading METS structure record for $htid\n";

        my $response = _data_api("structure",$htid, undef, {'alt' => "json"} );
        if (!defined $response) {
            die "Error: Failed to download METS structure record for $htid\n";
        }

        # The response body is assumed to be UTF-8 encoded octets (TODO
        # confirm against the service).  Decode it to characters first:
        # pushing the raw octets through the :utf8 output layer and
        # Encode::encode below would encode non-ASCII text twice.
        $json_content = Encode::decode("UTF-8", $response->content());

        my $js_out;
        if (open($js_out, '>', $ofilename)) {
            binmode($js_out,":utf8");
            print {$js_out} $json_content;
            if (!close($js_out)) {
                print STDERR "Error: Failed to finish writing $ofilename\n";
                print STDERR "       $!\n";
            }
        }
        else {
            print STDERR "Error: Failed to open $ofilename for output\n";
            print STDERR "       $!\n";
        }
    }
    else {
        print STDERR "Skipping Structure Data API request\n";
        print STDERR "=> Using cached version of JSON structure file:\n    $ofilename\n";

        my $js_in;
        if (open($js_in, '<', $ofilename)) {
            binmode($js_in,":utf8");

            my $slurped = do { local $/; <$js_in> };
            $json_content = $slurped if (defined $slurped);
            close($js_in);
        }
        else {
            print STDERR "Error: Failed to open cached JSON file $ofilename for input\n";
            print STDERR "       $!\n";
        }
    }

    if ($json_content eq "") {
        die "Error: No JSON structure content available for $htid\n";
    }

    # decode_json expects UTF-8 encoded octets rather than characters.
    my $json_content_utf8 = Encode::encode("utf8",$json_content);
    my $json_data = decode_json($json_content_utf8);

    return $json_data;
}


# Example of the <PagedDocument> item file format (the subs below emit the
# <PageGroup>/<Page> subset of it)

#<PagedDocument>
#  <Metadata name="Title">Matariki 1881</Metadata>
#  <Metadata name="Date">18810423</Metadata>
#  <Metadata name="Number">1</Metadata>
#  <PageGroup>
#    <Metadata name="Title">Supplementary Material</Metadata>
#    <Page txtfile="abstracts/23__1abstract.txt">
#      <Metadata name="Title">Abstract</Metadata>
#    </Page>
#  </PageGroup>
#  <PageGroup>
#    <Metadata name="Title">Newspaper pages</Metadata>
#    <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
#    <Page pagenum="2" imgfile="images/23__1_2.gif" txtfile="text/23__1_2.txt"/>
#    <Page pagenum="3" imgfile="images/23__1_3.gif" txtfile="text/23__1_3.txt"/>
#  </PageGroup>
#</PagedDocument>

# Recursively write the <PageGroup>/<Page> elements for one METS div (and
# all of its descendants) to the already-open PIOUT filehandle, downloading
# the page image and OCR text of every leaf that is encountered.
#
# Parameters:
#   $this_div            - hash ref for the METS:div being processed
#   $pagenum             - value for the pagenum attribute (may be undef)
#   $depth               - nesting depth, used for indentation only
#   $htid                - volume identifier passed on to the download subs
#   $file_id_map         - hash ref mapping a FILEID to its file entry; each
#                          entry is expected to carry 'USE', 'SEQ' and
#                          'METS:FLocat' => { 'xlink:href' => ... }
#   $resource_output_dir - directory that downloaded page files are saved in
#
# File pointers whose FILEID is unknown, or whose file entry lacks the
# fields needed, are reported on STDERR and skipped.
sub rec_paged_image_structure
{
    my ($this_div,$pagenum,$depth,$htid,$file_id_map,$resource_output_dir) = @_;

    # File names written into the XML are prefixed with the last path
    # component of the resource directory only.
    my ($local_output_dir) = ($resource_output_dir =~ m/^.*\/(.*?)$/);
    $local_output_dir = $resource_output_dir if (!defined $local_output_dir);

    # Escape a value for use inside a double-quoted XML attribute.
    my $xml_attr = sub {
        my ($value) = @_;
        $value =~ s/&/&amp;/g;
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        return $value;
    };

    my $fptr_entry = $this_div->{'METS:fptr'};
    my $div_entry  = $this_div->{'METS:div'};

    # Only want Greenstone's <PageGroup> tag if not a METS leaf div
    my $has_child_divs = (defined $div_entry);

    if ($has_child_divs) {
        print PIOUT "  " x $depth, "<PageGroup>\n";
    }

    if (defined $fptr_entry) {
        # hit a leaf node; upgrade a single entry to an array
        my $fptr_array = (ref($fptr_entry) eq "HASH") ? [ $fptr_entry ] : $fptr_entry;

        my $imgfile = undef;
        my $txtfile = undef;

        foreach my $fptr_hash (@$fptr_array) {
            my $fileid = $fptr_hash->{'FILEID'};
            my $file   = (defined $fileid) ? $file_id_map->{$fileid} : undef;

            if (!defined $file || !defined $file->{'USE'}) {
                print STDERR "Warning: skipping file pointer with no usable file entry",
                    ((defined $fileid) ? " (FILEID $fileid)" : ""), "\n";
                next;
            }

            my $use      = $file->{'USE'};
            my $is_image = ($use =~ m/\bimage\b/i) ? 1 : 0;
            my $is_ocr   = (!$is_image && $use =~ m/\bocr\b/i) ? 1 : 0;
            next if (!$is_image && !$is_ocr);

            my $seq  = $file->{'SEQ'};
            my $href = $file->{'METS:FLocat'}->{'xlink:href'};

            if (!defined $href) {
                print STDERR "Warning: skipping file entry without xlink:href (FILEID $fileid)\n";
                next;
            }

            if ($is_image) {
                $imgfile = "$local_output_dir/$href";
                pageimage_data_api($htid,$seq,"$resource_output_dir/$href");
            }
            else {
                $txtfile = "$local_output_dir/$href";
                pageocr_data_api($htid,$seq,"$resource_output_dir/$href");
            }
        }

        # Generate a line along the following lines
        #  <Page pagenum="1" imgfile="images/23__1_1.gif" txtfile="text/23__1_1.txt"/>
        print PIOUT "  " x ($depth+1), "<Page ";
        print PIOUT "pagenum=\"", $xml_attr->($pagenum), "\" " if defined $pagenum;
        print PIOUT "imgfile=\"", $xml_attr->($imgfile), "\" " if defined $imgfile;
        print PIOUT "txtfile=\"", $xml_attr->($txtfile), "\" " if defined $txtfile;
        print PIOUT "/>\n";
    }

    # Now process any child divs

    if ($has_child_divs) {
        # upgrade single entry to array
        my $div_array = (ref($div_entry) eq "HASH") ? [ $div_entry ] : $div_entry;

        print STDERR "+ Processing ", scalar(@$div_array), " sections\n";

        foreach my $div_hash (@$div_array) {
            my $child_pagenum = $div_hash->{'ORDER'};

            rec_paged_image_structure($div_hash,$child_pagenum,$depth+1,$htid,$file_id_map,$resource_output_dir);
        }

        print PIOUT "  " x $depth, "</PageGroup>\n";
    }

    return;
}

# Write the <PagedDocument> item file for one volume and download the page
# resources it refers to.
#
# Parameters:
#   $toplevel_div - hash ref of the top-level METS:div of the volume
#   $htid         - volume identifier
#   $file_id_map  - hash ref mapping a FILEID to its file entry
#   $ofilename    - path of the XML item file to create
#
# Page resources are stored in a directory named after $ofilename with its
# final extension removed; the directory is created when missing.  Output is
# written through the package filehandle PIOUT, which
# rec_paged_image_structure() prints to.  Problems are reported on STDERR.
sub generate_paged_image_structure
{
    my ($toplevel_div,$htid,$file_id_map,$ofilename) = @_;

    print STDERR "Generating PageImage file: $ofilename\n";

    my ($resource_output_dir) = ($ofilename =~ m/^(.*)\..+?$/);

    if (!defined $resource_output_dir) {
        print STDERR "Error: Cannot derive a resource directory from $ofilename (no file extension)\n";
        return;
    }

    if (!-d $resource_output_dir && !mkdir($resource_output_dir)) {
        # Without the directory none of the downloaded pages could be saved,
        # so do not start requesting them.
        print STDERR "Error: Failed to create directory $resource_output_dir\n";
        print STDERR "       $!\n";
        return;
    }

    if (!open(PIOUT, '>', $ofilename)) {
        print STDERR "Error: Failed to open $ofilename for output\n";
        print STDERR "       $!\n";
        return;
    }
    binmode(PIOUT,":utf8");

    print PIOUT "<PagedDocument>\n";

    rec_paged_image_structure($toplevel_div,1,1,$htid,$file_id_map,$resource_output_dir);

    print PIOUT "</PagedDocument>\n";

    if (!close(PIOUT)) {
        print STDERR "Error: Failed to finish writing $ofilename\n";
        print STDERR "       $!\n";
    }

    return;
}


# Number of documents handled by download_ht_doc() so far in this run.
my $pdCount = 0;

# Download everything belonging to one volume: the METS structure record,
# then (via generate_paged_image_structure) the item XML file together with
# the page images and OCR text.
#
# Parameters:
#   $cat_key   - catalog record key of the volume (currently unused)
#   $htid      - volume identifier
#   $ofilename - path of the JSON structure file; the item file name is
#                derived from it by replacing "_structure.json" with
#                "_item.xml"
sub download_ht_doc
{
    my ($cat_key,$htid,$ofilename) = @_;

    my $json_data = json_structure_data_api($htid,$ofilename);

    # A member that occurs once is represented as a single hash rather than
    # an array of hashes, so always normalise to an array ref.
    my $as_array = sub {
        my ($entry) = @_;
        return $entry     if (ref($entry) eq "ARRAY");
        return [ $entry ] if (ref($entry) eq "HASH");
        return [];
    };

    # Map in the IDs from:
    # METS:mets->METS:fileSec->METS:fileGrp

    my $file_sec_ids = {};

    my $file_grp_array
        = $as_array->($json_data->{'METS:mets'}->{'METS:fileSec'}->{'METS:fileGrp'});

    foreach my $file_grp (@$file_grp_array) {
        my $use = $file_grp->{'USE'};

        foreach my $file_hash (@{ $as_array->($file_grp->{'METS:file'}) }) {
            # push file_grp USE attribute down into each file entry (to make
            # the lookup easier later on)
            $file_hash->{'USE'} = $use;

            my $file_id = $file_hash->{'ID'};
            $file_sec_ids->{$file_id} = $file_hash if (defined $file_id);
        }
    }

    # METS:mets->METS:structMap->{nested METS:div}+

    my $struct_map = $json_data->{'METS:mets'}->{'METS:structMap'};
    # NOTE(review): if several structMaps are present the first one is used;
    # confirm that this is the one wanted.
    $struct_map = $struct_map->[0] if (ref($struct_map) eq "ARRAY");

    my $toplevel_div = (ref($struct_map) eq "HASH") ? $struct_map->{'METS:div'} : undef;
    if (!defined $toplevel_div) {
        print STDERR "Warning: no METS:structMap/METS:div found for $htid\n";
    }

    my $pi_filename = $ofilename;
    if ($pi_filename !~ s/_structure\.json$/_item.xml/) {
        # Never let the item file overwrite the structure file itself.
        $pi_filename .= "_item.xml";
    }

    generate_paged_image_structure($toplevel_div,$htid,$file_sec_ids,$pi_filename);

    $pdCount++;

    return;
}

# Process one catalog JSON file: find the first item whose rightsCode is
# "pd" (public domain) and download that volume.
#
# Parameters:
#   $filename - path of the catalog JSON file; the structure file of the
#               chosen volume is stored next to it as <name>_structure.json
#
# A file that cannot be opened, cannot be parsed, or holds no items is
# reported on STDERR and skipped, so that one bad file does not abort the
# processing of the remaining files.
sub read_json_file
{
    my ($filename) = @_;

    print STDERR "+ Processing file: $filename\n";

    my $json_in;
    if (!open($json_in, '<', $filename)) {
        print STDERR "Error: Failed to open JSON file $filename for input\n";
        print STDERR "       $!\n";
        return;
    }
    binmode($json_in,":utf8");

    my $json_file_content = do { local $/; <$json_in> };
    close($json_in);
    $json_file_content = "" if (!defined $json_file_content);

    # decode_json expects UTF-8 encoded octets rather than characters.
    my $json_data = eval { decode_json(Encode::encode("utf8",$json_file_content)) };
    if (ref($json_data) ne "HASH") {
        my $reason = $@ ? $@ : "top-level value is not a JSON object\n";
        print STDERR "Error: Failed to parse JSON file $filename\n";
        print STDERR "       $reason";
        return;
    }

    # Hash order is arbitrary, so sort to make the chosen key repeatable.
    my $record_hash = $json_data->{'records'};
    my ($primary_cat_key)
        = (ref($record_hash) eq "HASH") ? (sort keys %$record_hash) : ();

    my $items_entry = $json_data->{'items'};

    print STDERR "*** ref: ", ref($items_entry), "\n\n";

    # upgrade a single entry to an array; treat a missing entry as empty
    my $items_array = (ref($items_entry) eq "HASH")  ? [ $items_entry ]
                    : (ref($items_entry) eq "ARRAY") ? $items_entry
                    :                                  [];

    foreach my $item (@$items_array) {
        my $htid = $item->{'htid'};
        my $rights_code = $item->{'rightsCode'};

        next if (!defined $rights_code || $rights_code ne "pd");

        # in the public domain
        my $ofilename = $filename;
        $ofilename =~ s/\.json$/_structure.json/;

        download_ht_doc($primary_cat_key,$htid,$ofilename);

        # bail out at first public domain version of document
        last;
    }

    return;
}


# Recursively walk $full_dir and hand every catalog *.json file found to
# read_json_file().
#
# Parameters:
#   $full_dir - directory to process
#
# Entries whose name starts with "." are ignored.  Files ending in
# "_structure.json" are skipped as well: those are the structure records
# this script stores next to the catalog files, not catalog records.
# A directory that cannot be opened is reported on STDERR.
sub process_dir
{
    my ($full_dir) = @_;

    # A lexical handle keeps each level of the recursion independent.
    my $dir_handle;
    if (!opendir($dir_handle, $full_dir)) {
        print STDERR "Error: Failed to open directory: $full_dir\n";
        print STDERR "       $!\n";
        return;
    }

    my @dir_content = grep { $_ !~ m/^\./ } sort readdir($dir_handle);
    closedir($dir_handle);

    foreach my $df (@dir_content) {
        my $full_df = "$full_dir/$df";

        if (-d $full_df) {
            process_dir($full_df);
            next;
        }

        # file
        next if ($full_df !~ m/\.json$/);
        next if ($full_df =~ m/_structure\.json$/);

        read_json_file($full_df);
    }

    return;
}


# Program driver.
#
# Parameters:
#   $argv_ref - reference to the argument list; its first element is removed
#               and taken as the top-level directory to process, with
#               "output" used when that element is missing or false
sub main
{
    my ($argv_ref) = @_;

    my $first_arg = shift(@$argv_ref);
    my $toplevel_dir = $first_arg ? $first_arg : "output";

    # remove any trailing /
    $toplevel_dir =~ s{/$}{};

    process_dir($toplevel_dir);
}

# Script entry point: pass the command-line arguments to main() by reference.
main(\@ARGV);
