package org.hathitrust.lrl.pagelevel;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.hathitrust.lrl.core.UtilIO;
import org.hathitrust.lrl.core.VolPageIdFreq;

/**
 * Command-line driver that reads a CSV file of volume-page ids with frequency
 * counts, keeps the entries whose frequency meets {@link #FreqThreshold}, and
 * hands each kept id to {@code UtilIO.downloadEFJson}, using one output
 * directory per frequency value.
 */
public class DownloadPageHotspots {

	/**
	 * Debug switch: when true, {@link #main(String[])} stops after three downloads.
	 * <p>
	 * Note that {@link #FreqThreshold} is assigned once, in the static initializer
	 * below, from the value this field has at class-initialization time. Setting
	 * this field later at runtime therefore does not change the threshold.
	 */
	public static boolean DEBUG = false;
	
	/** Minimum frequency (inclusive) an entry needs in order to be downloaded by {@link #main(String[])}. */
	public static final int FreqThreshold;
	
	static {
		if (DEBUG) {
			System.err.println("Running with DEBUG=true => Lowering FreqThreshold to 4");
			System.err.println("This is allow for entries to be found when processing a small dataset");
			FreqThreshold = 4;
		}
		else {
			FreqThreshold = 5;
		}
	}

	/**
	 * Removes one leading and one trailing double-quote character, if present.
	 */
	private static String stripOuterQuotes(String field)
	{
		return field.replaceAll("^\"|\"$", "");
	}

	/**
	 * Reads a comma-separated file whose lines have the form {@code id,freq}
	 * and converts each line into a {@code VolPageIdFreq}.
	 * <p>
	 * Whitespace around the comma and at either end of the line is ignored,
	 * blank lines are skipped, and a pair of enclosing double quotes is removed
	 * from both the id and the frequency field. Fields after the second one are
	 * ignored.
	 *
	 * @param input_filename name of the file, passed as-is to
	 *                       {@code UtilIO.readLineBasedTextFile}
	 * @return the parsed entries, in file order
	 * @throws IndexOutOfBoundsException if a non-blank line has no second field
	 * @throws NumberFormatException if the frequency field is not an integer
	 */
	public static ArrayList<VolPageIdFreq> readVolPageIdFreq(String input_filename)
	{
		ArrayList<VolPageIdFreq> freq_volpage_list = new ArrayList<VolPageIdFreq>();
		
		ArrayList<String> text_lines = UtilIO.readLineBasedTextFile(input_filename);
		for (String raw_line: text_lines) {
			// The split pattern only consumes whitespace next to a comma, so trim the
			// line first; otherwise trailing whitespace ends up in the frequency field
			// (breaking parseInt) and leading whitespace ends up in the id.
			String line = raw_line.trim();
			if (line.isEmpty()) {
				// A blank line carries no entry; skip it instead of failing on get(1)
				continue;
			}
			
			List<String> items = Arrays.asList(line.split("\\s*,\\s*"));
			String volpage_id = stripOuterQuotes(items.get(0));
			String freq_str = stripOuterQuotes(items.get(1));
			int freq = Integer.parseInt(freq_str);
			
			VolPageIdFreq vol_page_freq = new VolPageIdFreq(volpage_id,freq);
			
			freq_volpage_list.add(vol_page_freq);
		}
		
		return freq_volpage_list;
	}
	
	/**
	 * Returns the entries whose frequency is greater than or equal to the
	 * given threshold. Despite the "Above" in the name, the comparison is
	 * inclusive.
	 *
	 * @param freq_volpage_ids entries to filter; not modified
	 * @param freq_threshold   minimum frequency (inclusive) to keep an entry
	 * @return a new list holding the kept entries in their original order
	 */
	public static ArrayList<VolPageIdFreq>  filterAboveFreqValue(ArrayList<VolPageIdFreq> freq_volpage_ids, int freq_threshold)
	{
		ArrayList<VolPageIdFreq> filtered_volpage_ids = new ArrayList<VolPageIdFreq>();
		
		for (VolPageIdFreq volpage_freq: freq_volpage_ids) {
			if (volpage_freq.getFreq() >= freq_threshold) {
				filtered_volpage_ids.add(volpage_freq);
			}
		}
		
		return filtered_volpage_ids;
		
	}
	
	/**
	 * Reads the frequency file under {@code UtilIO.LRL_HOME}, filters it by
	 * {@link #FreqThreshold}, and calls {@code UtilIO.downloadEFJson} for every
	 * remaining id with a target directory named after the entry's frequency.
	 * When {@link #DEBUG} is true the loop stops after three entries.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String[] args) 
	{	
		// Alternative input file, kept for reference:
		//String freq_sorted_volpage_filename = UtilIO.LRL_HOME + File.separator + "sorted-data-output" 
		//		+ File.separator + "maori-1000-fulltextsearch-volid-freq--10x5000.csv";
		String freq_sorted_volpage_filename = UtilIO.LRL_HOME + File.separator + "sorted-data-output" 
				+ File.separator + "maori-1000-fulltextsearch-volid-freq.csv";
		
		System.out.println("Reading in volpage frequency data file: " + freq_sorted_volpage_filename);
		ArrayList<VolPageIdFreq> freq_volpage_ids = readVolPageIdFreq(freq_sorted_volpage_filename);
		
		ArrayList<VolPageIdFreq> filtered_volpage_ids = filterAboveFreqValue(freq_volpage_ids,FreqThreshold);
		
		int num_filtered_volpage_ids = filtered_volpage_ids.size();
		
		// The part of the output path shared by every entry; only the final
		// frequency-named component varies inside the loop
		String output_base_dir_str = UtilIO.LRL_HOME + File.separator + "data-output" 
				+ File.separator + "page-ef-json" + File.separator;
		
		int pcount = 0;
		
		for (VolPageIdFreq volpage_freq: filtered_volpage_ids) {
			String volpage_id = volpage_freq.getID();
			int freq = volpage_freq.getFreq();
			
			String output_dir_str = output_base_dir_str + freq;
			File output_dir = new File(output_dir_str);
			
			if (!output_dir.exists()) {
				output_dir.mkdirs();
			}
			if (!output_dir.isDirectory()) {
				// Surface a failed directory creation instead of ignoring it; the
				// download is still attempted so the control flow is unchanged
				System.out.flush();
				System.err.println("Warning: output directory is not available: " + output_dir_str);
			}

			pcount++;
		
			System.out.println("Downloading #" + pcount + "/" + num_filtered_volpage_ids + "(vol-page-id: " + volpage_id + ")");
			System.out.println("  Saving EF JSON to directory:" + output_dir_str);
			UtilIO.downloadEFJson(volpage_id,output_dir_str);
			
			if ((DEBUG) && (pcount >= 3)) {
				// Flush stdout first so the stderr notice does not overtake earlier progress lines
				System.out.flush();
				System.err.println("**** cutting Download EF JSON Pages short");
				break;
			}
		}
		
		System.out.flush();
		System.err.println("Done!");
		
	}

}
