#!/usr/bin/env python3

# Program to convert xref JSON metadata files into MongoDB JSON files suitable
# for importing into a mongo database

import pymongo, gzip, argparse
import json, re, os, sys
import threading, concurrent.futures
import alive_progress

import logging

# Module-level logger handle; assigned in main() once verbosity is known
Logger  = None
# Verbosity level from -v flags; 0 = quiet, higher = chattier
Verbose = 0

# Only these xref work types are converted; each value counts how many
# records of that type have been accepted so far (shared across threads)
TypesAcceptedCounter = {
    "journal-article":     0,
    "proceedings-article": 0,
    "book-chapter":        0
}

# TypeCounters = [0] * len(TypesAccepted)

# Guards updates to TypesAcceptedCounter from the worker threads
TypesAcceptedCounterLock = threading.Lock()
                
def openMongoDBCollection(mongodb_uri,mongodb_database,mongodb_collection):
    """Connect to MongoDB at *mongodb_uri* and return the named collection.

    mongodb_database and mongodb_collection are the database and
    collection names (strings); the returned value is a pymongo
    collection handle ready for inserts.
    """
    logging.info("Connecting to MongoDB Database")

    client = pymongo.MongoClient(mongodb_uri)
    return client[mongodb_database][mongodb_collection]
    
        
def convertXRefFile(full_input_filename):
    """Convert one gzipped xref metadata file into a MongoDB-ready JSON string.

    Reads the "items" array from the gzipped JSON file, keeps only the
    work types listed in TypesAcceptedCounter, extracts the fields of
    interest, and returns them serialized as "{item},{item},...".
    Updates the shared TypesAcceptedCounter under its lock.

    Returns None when the file yields no accepted items or cannot be
    processed at all.
    """
    try:
        json_items = []
        logging.info("   Unzipping and opening " + full_input_filename)
        with gzip.open(full_input_filename, "r") as gzipped_file:
            # load the json in the file as a dictionary in python
            json_dictionary = json.load(gzipped_file)

            for item in json_dictionary["items"]:
                if "type" not in item:
                    continue

                # 'work_type' rather than 'type' -- don't shadow the builtin
                work_type = item["type"]
                if work_type not in TypesAcceptedCounter:
                    continue

                # 'with' releases the lock even if the update raises
                with TypesAcceptedCounterLock:
                    TypesAcceptedCounter[work_type] += 1

                # dictionary holding the relevant fields of a single item
                dict_to_dump = {}
                if "DOI" in item:
                    dict_to_dump["DOI"] = item["DOI"]
                if "title" in item:
                    dict_to_dump["title"] = item["title"][0]
                if "subtitle" in item:
                    dict_to_dump["subtitle"] = item["subtitle"][0]
                if "author" in item:
                    dict_to_dump["authors"] = item["author"]
                dict_to_dump["issued"] = item["issued"]["date-parts"]  # **** XXXX this was in DB version, but not JC tidyup
                dict_to_dump["dateIndexed"] = item["indexed"]["date-time"]
                dict_to_dump["dateCreated"] = item["created"]["date-time"]
                dict_to_dump["dateDeposited"] = item["deposited"]["date-time"]
                dict_to_dump["typeOfWork"] = work_type
                json_items.append(dict_to_dump)

        if not json_items:
            return None

        # join serialized items: "{item},{item},{item}" -- O(n), unlike
        # repeated string +=
        string_to_return = ",".join(json.dumps(item) for item in json_items)

        if Verbose > 1:
            logging.info("   Current total of converted records: " + repr(TypesAcceptedCounter))

        return string_to_return

    except Exception as inst:
        # broad catch is deliberate: one malformed file must not abort
        # the whole batch run -- log it and report failure via None
        logging.critical("Failed to process " + full_input_filename + " with exception " + str(inst))
        return None


def processFile(full_input_filename,opt_mongodb_collection,opt_output_dirname):
    """Convert one gzipped xref file, then insert it into MongoDB or write it out.

    If opt_mongodb_collection is not None the converted records are
    inserted there; otherwise a "mongo-<name>.json" file is written into
    opt_output_dirname. Returns a human-readable status string for the
    progress display.
    """
    input_filename = os.path.basename(full_input_filename)

    converted_md_json_string = convertXRefFile(full_input_filename)

    # Conversion failed or produced no accepted records: bail out early
    # instead of crashing on "[" + None + "]" below.
    if converted_md_json_string is None:
        return "Skipped (no accepted records): " + full_input_filename

    if opt_mongodb_collection is not None:
        try:
            converted_md = json.loads("[" + converted_md_json_string + "]")
            opt_mongodb_collection.insert_many(converted_md)
            logging.info("   Inserted " + full_input_filename)
        except Exception as inst:
            logging.critical("Failed to insert " + full_input_filename + " with exception " + str(inst))
    else:
        full_output_filename = os.path.join(opt_output_dirname,"mongo-"+input_filename)
        # escape both dots so e.g. "x.jsonXgz" is not accidentally matched
        full_output_filename = re.sub(r'\.json\.gz$',".json",full_output_filename)

        with open(full_output_filename, "w", encoding="utf-8") as outputFile:
            outputFile.write("[" + converted_md_json_string + "]")

    return "Processed: " + full_input_filename

def listDirFiles(input_dirname,maxfile_count):
    """Return full paths of the ".json.gz" files directly inside input_dirname.

    Stops early once maxfile_count paths have been collected; a
    maxfile_count of None (or 0) imposes no limit.
    """
    matches = []

    for entry in os.listdir(input_dirname):
        if not entry.endswith(".json.gz"):
            continue

        matches.append(os.path.join(input_dirname, entry))

        # len(matches) never equals None, so no limit when unset
        if len(matches) == maxfile_count:
            break

    return matches

    
def processDir(input_dirname, num_threads,maxfile_count, opt_mongodb_collection,opt_output_dirname):
    """Convert a directory of gzipped JSON metadata files using a thread pool.

    Submits one processFile() task per ".json.gz" file (limited by
    maxfile_count) to a pool of num_threads workers, printing each
    result as it completes behind an alive_progress progress bar.
    """
    filenames = listDirFiles(input_dirname,maxfile_count)

    # Based on:
    #   https://www.digitalocean.com/community/tutorials/how-to-use-threadpoolexecutor-in-python-3

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(processFile, filename, opt_mongodb_collection, opt_output_dirname)
            for filename in filenames
        ]

        with alive_progress.alive_bar(len(futures)) as bar:
            for completed in concurrent.futures.as_completed(futures):
                try:
                    print(completed.result())
                except Exception as exception:
                    logging.critical(str(exception))

                bar()

    return

# main entry point and argument parser
def main():
    """Parse command-line arguments and drive the conversion/ingest run.

    The two positional arguments select the input (directory or single
    .json.gz file) and the destination (a mongodb:// URI or an existing
    output directory). Exits with status 1 if the output directory does
    not exist.
    """
    # Logger was missing from the global statement, so the module-level
    # Logger was never assigned -- declare both names we rebind here.
    global Logger, Verbose

    parser = argparse.ArgumentParser(
        prog="xRefToMongo",
        description="Convert and ingest xRef metadata in GZIP compressed JSON files into MongoDB"
    )
    parser.add_argument("inputDirOrFile", metavar="input-directory|inputfile.json.gz",
                        help="input directory (or specific file) of GZIP compressed JSON formatted xRef metadata records")
    parser.add_argument("dbConnectionOrOutputDir", metavar="MongoDB URL connection|output-directory",
                        help="MongoDB URL connection in the form mongodb://localhost:27017/ to stream the processed JSON data to,"
                        +" or else an output directory where the process JSON files will be saved to")

    parser.add_argument("-v", "--verbose",              dest="verbose",                 default=Verbose, action="count",
                        help="enable verbose output")
    parser.add_argument("-c", "--maxfile-count",        dest="maxfile_count",           metavar="<int>", type=int,
                        help="number of files to convert if working with full metadata archive")
    parser.add_argument("-t", "--num-threads",          dest="num_threads",             metavar="<int>", type=int, default=1,
                        help="number of worker threads to use when converting files")
    parser.add_argument("-db", "--mongodb-database",    dest="mongodb_database_name",   metavar="<db-name>", default="MetadataQuality",
                        help="export directly to mongodb database (requires collection)")
    parser.add_argument("-col", "--mongodb-collection", dest="mongodb_collection_name", metavar="<col-name>", default="xref",
                        help="the collection to store JSON metadata records, in the specified database")

    # when maxfile_count not specified, equals None (meaning: no limit)
    parsedArgs = parser.parse_args()

    input_dir_or_file           = parsedArgs.inputDirOrFile
    db_connection_or_output_dir = parsedArgs.dbConnectionOrOutputDir

    # set up logger so that verbosity works
    Verbose = parsedArgs.verbose
    logging.basicConfig(level=logging.INFO)
    Logger = logging.getLogger("verbosityLogger")
    Logger.setLevel(Verbose * 10)

    # determine if dealing with an input dir or file
    input_dirname  = None
    input_filename = None

    if os.path.isdir(input_dir_or_file):
        input_dirname = input_dir_or_file
    else:
        input_filename = input_dir_or_file

    # determine if dealing with a MongoDB connection or an output dir
    mongodb_uri    = None
    output_dirname = None

    if db_connection_or_output_dir.startswith("mongodb://"):
        mongodb_uri = db_connection_or_output_dir
    else:
        output_dirname = db_connection_or_output_dir
        if not os.path.exists(output_dirname):
            logging.critical("Output directory does not exist: " + output_dirname)
            sys.exit(1)

    # Optional arguments
    maxfile_count           = parsedArgs.maxfile_count
    num_threads             = parsedArgs.num_threads
    mongodb_database_name   = parsedArgs.mongodb_database_name
    mongodb_collection_name = parsedArgs.mongodb_collection_name

    mongodb_collection = None

    if mongodb_uri is not None:
        mongodb_collection = openMongoDBCollection(mongodb_uri,mongodb_database_name,mongodb_collection_name)

    # dispatch on directory vs single-file input
    if input_dirname is not None:
        processDir(input_dirname, num_threads,maxfile_count, mongodb_collection,output_dirname)
    else:
        processFile(input_filename,mongodb_collection,output_dirname)

    print("----")
    print("Converted records: " + repr(TypesAcceptedCounter))
    print("----")
        
# Run only when executed as a script, not when imported as a module
if __name__ == "__main__": 
    main()
