/**********************************************************************
 *
 * PDFBoxToImagesAndText.java is based on Apache PDFBox®'s PDFToImage.java
 * with further code spliced in from its ExtractText.java, with some
 * minor modifications.
 *
 * The code in this file is therefore under the same Apache License
 * version 2.0 as Apache's PDFBox.
 *
 * Copyright 2018 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License version 2.0.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * Apache License version 2.0 for more details.
 *
 * You should have received a copy of the Apache License version 2.0
 * along with this file; if not, refer to
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * The following comment is from the original file,
 * PDFBox's PDFToImage.java
 * 
 *********************************************************************/
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.greenstone.pdfbox;

import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Locale;

import javax.imageio.ImageIO;

import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;

/**
 * This class is based on PDFToImage.java which converts
 * the pages of a PDF document to images.
 * This class should convert the pages to images and 
 * extract the text of each page. The latter part of the code
 * is taken from ExtractText.java.
 * Variables textOnly and imagesOnly determine which aspect
 * is output for each page, or whether both an image and text
 * are output per page.
 *
 * Built on Apache PDFBox's PDFToImage.java with minor modifications.
 * ak19
 */
public final class PDFBoxToImagesAndText
{
    private static final String PASSWORD = "-password";
    private static final String ENCODING = "-encoding";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String PAGE = "-page";
    private static final String IMAGE_TYPE = "-imageType";
    private static final String FORMAT = "-format";
    private static final String OUTPUT_PREFIX = "-outputPrefix";
    private static final String PREFIX = "-prefix";
    private static final String COLOR = "-color";
    private static final String RESOLUTION = "-resolution";
    private static final String DPI = "-dpi";
    private static final String CROPBOX = "-cropbox";
    private static final String TIME = "-time";
    private static final String TEXT_ONLY = "-textOnly"; // output just the text per page
    private static final String IMAGES_ONLY = "-imagesOnly"; // output just an image per page

    /** Encoding of the per-page text files when -encoding is not given. */
    private static final String STD_ENCODING = "UTF-8";

    /** Resolution used when the screen resolution cannot be queried (headless environment). */
    private static final int DEFAULT_DPI = 96;

    /**
     * Private constructor: this class only offers static methods.
     */
    private PDFBoxToImagesAndText()
    {
        //static class
    }

    /**
     * Command-line entry point. For each page in the requested range this writes
     * an image named <code>&lt;folder&gt;&lt;page number&gt;.&lt;format&gt;</code>
     * and/or a text file named <code>&lt;folder&gt;&lt;page number&gt;.txt</code>,
     * where <code>&lt;folder&gt;</code> is the folder part of the output prefix
     * (or of the input file when no prefix is given).
     *
     * The process exits with status 1 after printing the usage on bad arguments
     * or when no image writer exists for the requested format, and with
     * status 2 when the colour name is not recognised.
     *
     * @param args Command line options followed by the PDF file to process;
     *             see {@link #usage()} for the list of options.
     *
     * @throws IOException If there is an error loading, rendering or closing the document.
     */
    public static void main( String[] args ) throws IOException
    {
        try
        {
            // force KCMS (faster than LCMS) if available
            Class.forName("sun.java2d.cmm.kcms.KcmsServiceProvider");
            System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
        }
        catch (ClassNotFoundException e)
        {
            // KCMS is not available on this JVM: keep the default colour management module
        }

        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        String password = "";
        String encoding = STD_ENCODING;
        String pdfFile = null;
        String outputPrefix = null;
        String imageFormat = "jpg";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        String color = "rgb";
        int dpi;
        float cropBoxLowerLeftX = 0;
        float cropBoxLowerLeftY = 0;
        float cropBoxUpperRightX = 0;
        float cropBoxUpperRightY = 0;
        boolean showTime = false;
        boolean textOnly = false;
        boolean imagesOnly = false;

        try
        {
            dpi = Toolkit.getDefaultToolkit().getScreenResolution();
        }
        catch( HeadlessException e )
        {
            dpi = DEFAULT_DPI;
        }

        try
        {
            for( int i = 0; i < args.length; i++ )
            {
                String arg = args[i];
                if( PASSWORD.equals( arg ) )
                {
                    password = requireValue( args, ++i );
                }
                else if( ENCODING.equals( arg ) )
                {
                    encoding = requireValue( args, ++i );
                }
                else if( START_PAGE.equals( arg ) )
                {
                    startPage = Integer.parseInt( requireValue( args, ++i ) );
                }
                else if( END_PAGE.equals( arg ) )
                {
                    endPage = Integer.parseInt( requireValue( args, ++i ) );
                }
                else if( PAGE.equals( arg ) )
                {
                    startPage = Integer.parseInt( requireValue( args, ++i ) );
                    endPage = startPage;
                }
                else if( IMAGE_TYPE.equals( arg ) || FORMAT.equals( arg ) )
                {
                    imageFormat = requireValue( args, ++i );
                }
                else if( OUTPUT_PREFIX.equals( arg ) || PREFIX.equals( arg ) )
                {
                    outputPrefix = requireValue( args, ++i );
                }
                else if( COLOR.equals( arg ) )
                {
                    color = requireValue( args, ++i );
                }
                else if( RESOLUTION.equals( arg ) || DPI.equals( arg ) )
                {
                    dpi = Integer.parseInt( requireValue( args, ++i ) );
                }
                else if( CROPBOX.equals( arg ) )
                {
                    cropBoxLowerLeftX = Float.parseFloat( requireValue( args, ++i ) );
                    cropBoxLowerLeftY = Float.parseFloat( requireValue( args, ++i ) );
                    cropBoxUpperRightX = Float.parseFloat( requireValue( args, ++i ) );
                    cropBoxUpperRightY = Float.parseFloat( requireValue( args, ++i ) );
                }
                else if( TEXT_ONLY.equals( arg ) )
                {
                    textOnly = true;
                }
                else if( IMAGES_ONLY.equals( arg ) )
                {
                    imagesOnly = true;
                }
                else if( TIME.equals( arg ) )
                {
                    showTime = true;
                }
                else if( pdfFile == null )
                {
                    // the first argument that is not an option is the input file;
                    // any further stray arguments are ignored
                    pdfFile = arg;
                }
            }
        }
        catch( NumberFormatException e )
        {
            System.err.println( "Error: invalid numeric argument (" + e.getMessage() + ")" );
            usage();
            return;
        }

        if( pdfFile == null )
        {
            usage();
            return;
        }
        if( textOnly && imagesOnly )
        {
            // each flag switches the other kind of output off, so together nothing would be written
            System.err.println( "Error: " + TEXT_ONLY + " and " + IMAGES_ONLY + " cannot be combined." );
            usage();
            return;
        }

        // validate the colour before the document is opened, so that exiting
        // here cannot leave an open document behind
        ImageType imageType = parseImageType( color );
        if (imageType == null)
        {
            System.err.println( "Error: Invalid color." );
            System.exit( 2 );
        }

        if( outputPrefix == null )
        {
            outputPrefix = stripExtension( pdfFile );
        }
        // include the folder (/) but not the filename prefix: output files are named by page number
        String outputFolder = outputPrefix.substring( 0, lastSeparatorIndex( outputPrefix ) + 1 );

        boolean success = true;
        PDDocument document = null;
        try
        {
            document = PDDocument.load(new File(pdfFile), password);

            boolean extractingTextAllowed = true;
            AccessPermission ap = document.getCurrentAccessPermission();
            if( ! ap.canExtractContent() )
            {
                // not fatal: the pages can still be output as images
                System.err.println( "*** You do not have permission to extract text" );
                extractingTextAllowed = false;
            }
            boolean extractText = !imagesOnly && extractingTextAllowed;

            // don't extract to HTML in this class, just extract to txt
            PDFTextStripper stripper = null;
            if( extractText )
            {
                stripper = new PDFTextStripper();
                stripper.setShouldSeparateByBeads( true );
            }

            // if a CropBox has been specified, update the CropBox of every page
            if ( cropBoxLowerLeftX!=0 || cropBoxLowerLeftY!=0
                    || cropBoxUpperRightX!=0 || cropBoxUpperRightY!=0 )
            {
                changeCropBox(document,
                        cropBoxLowerLeftX, cropBoxLowerLeftY,
                        cropBoxUpperRightX, cropBoxUpperRightY);
            }

            long startTime = System.nanoTime();

            // clamp the requested range to the pages that actually exist
            int firstPage = Math.max(startPage, 1);
            int lastPage = Math.min(endPage, document.getNumberOfPages());
            PDFRenderer renderer = new PDFRenderer(document);
            for (int i = firstPage - 1; i < lastPage; i++)
            {
                String fileName = outputFolder + (i + 1) + ".";

                if( !textOnly )
                {
                    // turn page into image
                    BufferedImage image = renderer.renderImageWithDPI(i, dpi, imageType);
                    success &= ImageIOUtil.writeImage(image, fileName + imageFormat, dpi);
                }

                // image version of page done, now extract text from current page
                if( extractText )
                {
                    writePageText( document, stripper, i + 1, fileName, encoding );
                }
            }

            // GS NOTE: We just extracted text for (each page of) the main document, but
            // we're not additionally extracting text for any "embedded PDFs" as is done in ExtractText.java

            // performance stats
            if (showTime)
            {
                long duration = System.nanoTime() - startTime;
                int count = Math.max(0, 1 + lastPage - firstPage);
                System.err.printf("Rendered %d page%s in %dms\n", count, count == 1 ? "" : "s",
                                  duration / 1000000);
            }
        }
        finally
        {
            if( document != null )
            {
                document.close();
            }
        }

        // reported after the document has been closed, because System.exit skips finally blocks
        if (!success)
        {
            System.err.println( "Error: no writer found for image format '"
                    + imageFormat + "'" );
            System.exit(1);
        }
    }

    /**
     * Returns the value that follows an option on the command line, or prints
     * the usage and exits when the option is the last argument.
     *
     * @param args the command line arguments
     * @param index position at which the option's value is expected
     * @return the argument at <code>index</code>
     */
    private static String requireValue( String[] args, int index )
    {
        if( index >= args.length )
        {
            usage(); // exits the JVM
        }
        return args[index];
    }

    /**
     * Maps a colour name from the command line to the PDFBox image type.
     *
     * @param color one of bilevel, gray, rgb or rgba (case-insensitive)
     * @return the matching image type, or <code>null</code> if the name is not recognised
     */
    private static ImageType parseImageType( String color )
    {
        if ("bilevel".equalsIgnoreCase(color))
        {
            return ImageType.BINARY;
        }
        if ("gray".equalsIgnoreCase(color))
        {
            return ImageType.GRAY;
        }
        if ("rgb".equalsIgnoreCase(color))
        {
            return ImageType.RGB;
        }
        if ("rgba".equalsIgnoreCase(color))
        {
            return ImageType.ARGB;
        }
        return null;
    }

    /**
     * Finds the last folder separator in a path. Both '/' and the platform
     * separator are recognised, so forward-slash paths also work on Windows.
     *
     * @param path the path to inspect
     * @return index of the last separator, or -1 if the path contains none
     */
    private static int lastSeparatorIndex( String path )
    {
        return Math.max( path.lastIndexOf( '/' ), path.lastIndexOf( File.separatorChar ) );
    }

    /**
     * Removes the extension from the file name at the end of a path. A dot that
     * belongs to a folder name is left alone, and a path without an extension
     * is returned unchanged.
     *
     * @param path the path of the input file
     * @return the path without the trailing ".ext" of its file name
     */
    private static String stripExtension( String path )
    {
        int lastDot = path.lastIndexOf( '.' );
        return lastDot > lastSeparatorIndex( path ) ? path.substring( 0, lastDot ) : path;
    }

    /**
     * Writes the text of a single page to <code>fileName + "txt"</code>.
     * Failures are reported on stderr and do not stop the processing of the
     * remaining pages.
     *
     * @param document the loaded document
     * @param stripper the text stripper to use; its start and end page are set to the given page
     * @param pageNumber 1-based number of the page to extract
     * @param fileName output path up to and including the dot before the extension
     * @param encoding character encoding of the text file
     */
    private static void writePageText( PDDocument document, PDFTextStripper stripper,
                                       int pageNumber, String fileName, String encoding )
    {
        FileOutputStream stream = null;
        Writer output = null;
        try
        {
            // the stream is kept in its own variable so that it is still closed
            // when the writer cannot be created for an unsupported encoding
            stream = new FileOutputStream( fileName + "txt" );
            output = new OutputStreamWriter( stream, encoding );
            stripper.setStartPage( pageNumber );
            stripper.setEndPage( pageNumber );

            System.err.println("Writing to "+fileName);

            // Extract text for main document, the specified page
            stripper.writeText( document, output );
            // flush here so that a write failure is reported instead of being
            // swallowed by closeQuietly below
            output.flush();
        }
        catch (Exception ex)
        {
            System.err.println( "*** Unable to create txt file " + fileName + "txt. Exception: " + ex.getMessage());
        }
        finally
        {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * This will print the usage requirements to stderr and exit with status 1.
     */
    private static void usage()
    {
        String message = "Usage: java -jar pdfbox-app-x.y.z.jar PDFBoxToImagesAndText [options] <inputfile>\n"
            + "\nOptions:\n"
            + "  -password  <password>            : Password to decrypt document\n"
            + "  -encoding  <output encoding>     : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n"
            + "  -format <string>                 : Image format: " + getImageFormats() + "\n"
            + "  -prefix <string>                 : Output location; only the folder part is used, files are\n"
            + "                                     named <page number>.<format> and <page number>.txt\n"
            + "  -page <number>                   : The only page to extract (1-based)\n"
            + "  -startPage <int>                 : The first page to start extraction (1-based)\n"
            + "  -endPage <int>                   : The last page to extract(inclusive)\n"
            + "  -color <string>                  : The color depth (valid: bilevel, gray, rgb, rgba)\n"
            + "  -dpi <int>                       : The DPI of the output image\n"
            + "  -cropbox <int> <int> <int> <int> : The page area to export\n"
            + "  -time                            : Prints timing information to stderr\n"
            + "  -textOnly                        : Output only the text of each page\n"
            + "  -imagesOnly                      : Output only an image of each page\n"
            + "  <inputfile>                      : The PDF document to use\n";

        System.err.println(message);
        System.exit( 1 );
    }

    /**
     * Lists the image formats that can be written, for the usage message.
     * ImageIO reports the same format under several spellings (for example
     * "jpg" and "JPG"); only the all-lowercase spellings are kept.
     *
     * @return the format names separated by ", "
     */
    private static String getImageFormats()
    {
        StringBuilder retval = new StringBuilder();
        for( String format : ImageIO.getWriterFormatNames() )
        {
            if( format.equals( format.toLowerCase( Locale.ROOT ) ) )
            {
                if( retval.length() > 0 )
                {
                    retval.append( ", " );
                }
                retval.append( format );
            }
        }
        return retval.toString();
    }

    /**
     * Sets the crop box of every page of the document to the given rectangle.
     *
     * @param document the document whose pages are changed
     * @param a lower left x coordinate
     * @param b lower left y coordinate
     * @param c upper right x coordinate
     * @param d upper right y coordinate
     */
    private static void changeCropBox(PDDocument document, float a, float b, float c, float d)
    {
        for (PDPage page : document.getPages())
        {
            System.out.println("resizing page");
            PDRectangle rectangle = new PDRectangle();
            rectangle.setLowerLeftX(a);
            rectangle.setLowerLeftY(b);
            rectangle.setUpperRightX(c);
            rectangle.setUpperRightY(d);
            page.setCropBox(rectangle);
        }
    }
}
