# -*- coding: utf-8 -*-
##
## This code is based on Invenio software modules,
## modified for OCR processing in D4Science-II project.
## 
## OCR use case contact in D4Science-II: Jukka Klem
## Code main developer: Juliusz Sompolski
##
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

import os
import re
import sys
import shutil
import tempfile
import HTMLParser
import time

from logging import debug, error, DEBUG, getLogger
from htmlentitydefs import entitydefs
from optparse import OptionParser
from invenio.hocrlib import create_pdf, extract_hocr

from invenio.shellutils import run_process_with_timeout
from invenio.config import *
from pyPdf import PdfFileReader, PdfFileWriter
from pyPdf.generic import NameObject, createStringObject

#logger = getLogger()
#logger.setLevel(DEBUG)

# Translation table from two-letter language codes to the three-letter
# codes handed to the OCR tool through its --tesslanguage option.
CFG_TWO2THREE_LANG_CODES = dict([
    ('de', 'deu'),
    ('en', 'eng'),
    ('es', 'spa'),
    ('fr', 'fra'),
    ('it', 'ita'),
    ('nl', 'nld'),
])

# Matches one or more consecutive whitespace characters.
_RE_CLEAN_SPACES = re.compile(r'\s+')


class InvenioWebSubmitFileConverterError(Exception):
    """Error raised by the conversion helpers of this module, e.g. when an
    external command fails or a temporary file cannot be created."""

def pdf2pdfhocr(input_pdf, text_hocr, output_pdf, rotations=None, font='Courier', draft=False):
    """
    Adds the OCRed text to the original pdf.

    The hOCR file is first rendered to a temporary PDF with create_pdf.
    Page i of that PDF is then merged with page i of the input PDF and the
    result is written to output_pdf. The temporary PDF is removed before
    returning.

    @param input_pdf: [string] path of the original PDF
    @param text_hocr: [string] path of the hOCR file holding the OCRed text
    @param output_pdf: [string] path the merged PDF is written to
    @param rotations: a list of angles by which pages should be rotated;
        pages without an entry are not rotated
    @param font: passed on to create_pdf
    @param draft: passed on to create_pdf; it also selects the merge order:
        when true the text page is merged onto the original page, otherwise
        the original page is merged onto the text page
    """
    if rotations is None:
        rotations = []

    def _get_page_rotation(i):
        if len(rotations) > i:
            return rotations[i]
        return 0

    input_pdf, hocr_pdf, dummy = prepare_io(input_pdf, output_ext='.pdf', need_working_dir=False)
    try:
        with open(text_hocr) as hocr_file:
            hocr_text = hocr_file.read()
        create_pdf(extract_hocr(hocr_text), hocr_pdf, font, draft)

        # Both source streams stay open until output.write() has finished,
        # because the writer may still pull page data from the readers then.
        with open(input_pdf, "rb") as orig_stream:
            with open(hocr_pdf, "rb") as text_stream:
                input1 = PdfFileReader(orig_stream)
                input2 = PdfFileReader(text_stream)
                output = PdfFileWriter()

                # Carry the document information of the original over.
                info = input1.getDocumentInfo()
                if info:
                    infoDict = output._info.getObject()
                    infoDict.update(info)

                for i in range(0, input1.getNumPages()):
                    orig_page = input1.getPage(i)
                    text_page = input2.getPage(i)
                    angle = _get_page_rotation(i)
                    if angle != 0:
                        sys.stderr.write("Rotating page %d by %d degrees.\n" % (i, angle))
                        text_page = text_page.rotateClockwise(angle)
                    if draft:
                        below, above = orig_page, text_page
                    else:
                        below, above = text_page, orig_page
                    below.mergePage(above)
                    output.addPage(below)

                with open(output_pdf, "wb") as outputStream:
                    output.write(outputStream)
    finally:
        # The intermediate text-only PDF is private to this function.
        if os.path.exists(hocr_pdf):
            os.remove(hocr_pdf)

def guess_ocropus_produced_garbage(input_file, hocr_p):
    """Return True if the output produced by OCROpus contains only garbage
    instead of text.

    This is implemented via an heuristic: every character of every word is
    counted either as good (an ASCII letter a-z or A-Z) or as bad (anything
    else). The output is considered garbage when the bad characters
    strictly outnumber the good ones.

    @param input_file: [string] path of the OCROpus output to inspect
    @param hocr_p: if true input_file is parsed as hOCR through
        extract_hocr, otherwise it is read as plain text
    @return: [bool] True when the content looks like garbage
    """

    def _get_words_from_text():
        ret = []
        with open(input_file) as text_file:
            for row in text_file:
                for word in row.strip().split(' '):
                    ret.append(word.strip())
        return ret

    def _get_words_from_hocr():
        ret = []
        with open(input_file) as hocr_file:
            hocr = extract_hocr(hocr_file.read())
        for dummy, dummy, lines in hocr:
            for dummy, line in lines:
                for word in line.split():
                    ret.append(word.strip())
        return ret

    def _as_unicode(word):
        # Byte strings that are not valid UTF-8 must not abort the check:
        # undecodable bytes become U+FFFD, which counts as a bad character.
        if isinstance(word, bytes):
            return word.decode('utf-8', 'replace')
        return word

    if hocr_p:
        words = _get_words_from_hocr()
    else:
        words = _get_words_from_text()

    goods = 0
    bads = 0
    for word in words:
        for char in _as_unicode(word):
            if (u'a' <= char <= u'z') or (u'A' <= char <= u'Z'):
                goods += 1
            else:
                bads += 1
    if bads > goods:
        debug('OCROpus produced garbage')
        return True
    return False


# Matches a line of the form "Title: <value>" and captures the value with
# the surrounding whitespace stripped.
_RE_FIND_TITLE = re.compile(r'^Title:\s*(.*?)\s*$')

def pdf2pdfopt(input_file, output_file=None, **dummy):
    """
    Run the CFG_PATH_PDFOPT tool on a PDF in order to linearize it, which
    improves the experience of viewing the document through the web.
    @param input_file [string] path of the PDF to process
    @param output_file [string] path of the resulting PDF, None to have a
        temporary file generated
    @return [string] path of the resulting PDF
    raise InvenioWebSubmitFileConverterError in case of errors.
    """
    prepared = prepare_io(input_file, output_file, '.pdf', need_working_dir=False)
    source_path = prepared[0]
    target_path = prepared[1]
    execute_command(CFG_PATH_PDFOPT, source_path, target_path)
    return target_path

def pdf2hocr2pdf(input_file, pdf_output_file=None, ln='en', font="Courier", author=None, keywords=None, subject=None, title=None, draft=False, pdfopt=True, **dummy):
    """
    OCR every page of a PDF and return the path of a PDF in which the
    recognized text has been merged with the original pages.

    Each page is rendered to a PPM image with CFG_PATH_PDFTOPPM, then
    rotated, deskewed and recognized with CFG_PATH_OCROSCRIPT. The angles
    0, 180, 90 and 270 are tried in that order and the first one for which
    all three steps succeed is kept; when none does, an empty hOCR page is
    emitted for that page. The hOCR output of all pages is accumulated in
    input_file + '.hocr' and finally merged into the PDF by pdf2pdfhocr.

    @param input_file: [string] path of the PDF to process
    @param pdf_output_file: [string] path of the resulting PDF, None to
        have a temporary file generated
    @param ln: language code given to the OCR tool as a hint; a two letter
        code is translated through CFG_TWO2THREE_LANG_CODES (falling back
        to 'eng' when unknown)
    @param font: forwarded to pdf2pdfhocr
    @param author, keywords, subject, title: accepted but not used here
    @param draft: forwarded to pdf2pdfhocr
    @param pdfopt: when true the merged PDF is post-processed by pdf2pdfopt
    @return: [string] path of the resulting PDF
    @raise InvenioWebSubmitFileConverterError: when OCROpus is not
        available or when a mandatory external command fails
    """

    def _perform_rotate(working_dir, imagefile, angle):
        """Write imagefile, rotated by angle degrees, to rotated.ppm in
        working_dir. Always return True."""
        debug('Performing rotate on %s by %s degrees' % (imagefile, angle))
        source = os.path.join(working_dir, imagefile)
        target = os.path.join(working_dir, 'rotated.ppm')
        if not angle:
            # Nothing to rotate: a plain copy is enough.
            shutil.copy(source, target)
        else:
            execute_command(CFG_PATH_CONVERT, source, '-rotate', str(angle), '-depth', str(8), target)
        return True

    def _perform_deskew(working_dir):
        """Perform ocroscript deskew on rotated.ppm, creating deskewed.ppm.
        Return True if deskewing was fine, i.e. the command succeeded and
        printed nothing on stderr."""
        debug('Performing deskew.')
        try:
            dummy, stderr = execute_command_with_stderr(CFG_PATH_OCROSCRIPT, 'deskew', os.path.join(working_dir, 'rotated.ppm'), os.path.join(working_dir, 'deskewed.ppm'))
        except InvenioWebSubmitFileConverterError as err:
            sys.stderr.write('Deskewing error: %s\n' % err)
            return False
        if stderr.strip():
            debug('Errors found during deskewing')
            return False
        return True

    def _perform_recognize(working_dir, imagefile):
        """Perform ocroscript recognize on deskewed.ppm, creating
        recognize.out. Return True if recognizing was fine, i.e. the
        command succeeded, printed nothing on stderr and its output does
        not look like garbage. imagefile is only used for logging."""
        debug('Performing recognize on %s' % imagefile)
        recognized = os.path.join(working_dir, 'recognize.out')
        try:
            dummy, stderr = execute_command_with_stderr(CFG_PATH_OCROSCRIPT, 'recognize', '--tesslanguage=%s' % ln, '--output-mode=hocr', os.path.join(working_dir, 'deskewed.ppm'), filename_out=recognized)
        except InvenioWebSubmitFileConverterError as err:
            sys.stderr.write('Recognizer error: %s\n' % err)
            return False
        if stderr.strip():
            ## There was some output on stderr
            debug('Errors found in recognize.err')
            return False
        return not guess_ocropus_produced_garbage(recognized, True)

    def _perform_dummy_recognize(working_dir, imagefile):
        """Write to recognize.out an empty hocr page referencing the image."""
        debug('Performing dummy recognize on %s' % imagefile)
        out = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta content="ocr_line ocr_page" name="ocr-capabilities"/><meta content="en" name="ocr-langs"/><meta content="Latin" name="ocr-scripts"/><meta content="" name="ocr-microformats"/><title>OCR Output</title></head>
<body><div class="ocr_page" title="bbox 0 0 1 1; image %s">
</div></body></html>""" % os.path.join(working_dir, imagefile)
        with open(os.path.join(working_dir, 'recognize.out'), 'w') as out_file:
            out_file.write(out)

    def _find_image_file(working_dir, imageprefix, page):
        """Return the name of the image generated for page, or None when
        there is none. The amount of zero padding in the generated name is
        not known in advance, so widths from 1 to 6 digits are tried."""
        patterns = ('%s-%d.ppm', '%s-%02d.ppm', '%s-%03d.ppm',
                    '%s-%04d.ppm', '%s-%05d.ppm', '%s-%06d.ppm')
        for pattern in patterns:
            candidate = pattern % (imageprefix, page)
            if os.path.exists(os.path.join(working_dir, candidate)):
                return candidate
        ## I guess we won't have documents with more than million pages
        return None

    if not CFG_PATH_OCROSCRIPT:
        raise InvenioWebSubmitFileConverterError("It's impossible to generate HOCR output from PDF. OCROpus is not available.")

    if len(ln) == 2:
        ln = CFG_TWO2THREE_LANG_CODES.get(ln, 'eng')
    input_file, hocr_output_file, working_dir = prepare_io(input_file, input_file+'.hocr', '.hocr')

    try:
        page = 0
        rotations = []
        # NOTE(review): the loop ends when no image shows up for the
        # requested page. This relies on the pdftoppm command exiting with
        # status 0 when asked for a page past the end of the document,
        # since execute_command raises otherwise -- confirm for the
        # deployed pdftoppm version.
        while True:
            page = page + 1
            sys.stderr.write('Page %d.\n' % page)
            execute_command(CFG_PATH_PDFTOPPM, '-f', str(page), '-l', str(page), '-r', str(CFG_PPM_RESOLUTION), '-aa', 'yes', '-freetype', 'yes', input_file, os.path.join(working_dir, 'image'))
            imagefile = _find_image_file(working_dir, 'image', page)
            if imagefile is None:
                break
            for angle in (0, 180, 90, 270):
                sys.stderr.write('Trying %d degrees...\n' % angle)
                if _perform_rotate(working_dir, imagefile, angle) and _perform_deskew(working_dir) and _perform_recognize(working_dir, imagefile):
                    rotations.append(angle)
                    break
            else:
                # No orientation produced usable text for this page.
                sys.stderr.write('Dummy recognize\n')
                rotations.append(0)
                _perform_dummy_recognize(working_dir, imagefile)
            with open(os.path.join(working_dir, 'recognize.out')) as page_hocr:
                with open(hocr_output_file, 'a') as all_hocr:
                    all_hocr.write(page_hocr.read())
            # clean
            os.remove(os.path.join(working_dir, imagefile))

        input_file, tmp_output_file, dummy = prepare_io(input_file, output_ext='.pdf', need_working_dir=False)
        pdf2pdfhocr(input_file, hocr_output_file, tmp_output_file, rotations, font=font, draft=draft)
        if pdfopt:
            try:
                pdf_output_file = pdf2pdfopt(tmp_output_file, pdf_output_file)
            finally:
                os.remove(tmp_output_file)
        elif pdf_output_file is None:
            # No destination requested: hand back the temporary file itself.
            pdf_output_file = tmp_output_file
        else:
            shutil.move(tmp_output_file, pdf_output_file)
        return pdf_output_file
    finally:
        # The working dir holds the input copy and all intermediate images;
        # drop it whether or not the conversion succeeded.
        clean_working_dir(working_dir)

def prepare_io(input_file, output_file=None, output_ext=None, need_working_dir=True):
    """Clean input_file and the output_file.

    @param input_file: [string] path of the input file
    @param output_file: [string] path of the output file. When None, an
        empty temporary file is created inside CFG_TMPDIR; otherwise the
        path is made absolute and any existing file at that path is removed
    @param output_ext: [string] suffix of the temporary output file; only
        used when output_file is None, in which case it defaults to '.tmp'
    @param need_working_dir: when true a fresh directory is created inside
        CFG_TMPDIR and input_file is copied there as 'input' + its original
        extension; otherwise input_file is only made absolute
    @return: the tuple (input_file, output_file, working_dir), where
        working_dir is None when need_working_dir is false
    @raise InvenioWebSubmitFileConverterError: when the temporary file or
        directory cannot be created
    """
    debug('Preparing IO for input=%s, output=%s, output_ext=%s' % (input_file, output_file, output_ext))
    if output_file is None:
        if output_ext is None:
            output_ext = '.tmp'
        try:
            (fd, output_file) = tempfile.mkstemp(suffix=output_ext, dir=CFG_TMPDIR)
            os.close(fd)
        except (IOError, OSError) as err:
            # tempfile reports most failures as OSError, which is not an
            # IOError on Python 2, so both have to be caught.
            raise InvenioWebSubmitFileConverterError("It's impossible to create a temporary file: %s" % err)
    else:
        output_file = os.path.abspath(output_file)
        if os.path.exists(output_file):
            os.remove(output_file)

    if need_working_dir:
        try:
            working_dir = tempfile.mkdtemp(dir=CFG_TMPDIR, prefix='conversion')
        except (IOError, OSError) as err:
            raise InvenioWebSubmitFileConverterError("It's impossible to create a temporary directory: %s" % err)

        input_ext = os.path.splitext(input_file)[1]
        new_input_file = os.path.join(working_dir, 'input' + input_ext)
        try:
            shutil.copy(input_file, new_input_file)
        except (IOError, OSError):
            # Do not leave the freshly created directory behind; the
            # original error is propagated unchanged.
            shutil.rmtree(working_dir, ignore_errors=True)
            raise
        input_file = new_input_file
    else:
        working_dir = None
        input_file = os.path.abspath(input_file)

    debug('IO prepared: input_file=%s, output_file=%s, working_dir=%s' % (input_file, output_file, working_dir))
    return (input_file, output_file, working_dir)


def clean_working_dir(working_dir):
    """
    Delete the working_dir together with everything inside it. Errors met
    while deleting are ignored.
    """
    message = 'Cleaning working_dir: %s' % working_dir
    debug(message)
    shutil.rmtree(working_dir, True)


def execute_command(*args, **argd):
    """Run the command made of args through run_process_with_timeout.

    The optional keyword arguments cwd, filename_out and filename_err are
    handed over to run_process_with_timeout (None when not given).

    Return the stdout value reported by run_process_with_timeout.
    Raise InvenioWebSubmitFileConverterError when the reported exit status
    is not 0.
    """
    debug("Executing: %s" % (args, ))
    exit_status, out, err = run_process_with_timeout(
        args,
        cwd=argd.get('cwd'),
        filename_out=argd.get('filename_out'),
        filename_err=argd.get('filename_err'))
    if res_ok(exit_status) if False else exit_status == 0:
        return out
    error("Error when executing %s" % (args, ))
    raise InvenioWebSubmitFileConverterError("Error in running %s\n stdout:\n%s\nstderr:\n%s\n" % (args, out, err))


def execute_command_with_stderr(*args, **argd):
    """Run the command made of args through run_process_with_timeout.

    The optional keyword arguments cwd and filename_out are handed over to
    run_process_with_timeout (None when not given).

    Return the (stdout, stderr) pair reported by run_process_with_timeout.
    Raise InvenioWebSubmitFileConverterError when the reported exit status
    is not 0.
    """
    debug("Executing: %s" % (args, ))
    exit_status, out, err = run_process_with_timeout(
        args,
        cwd=argd.get('cwd'),
        filename_out=argd.get('filename_out'))
    if exit_status == 0:
        return out, err
    error("Error when executing %s" % (args, ))
    raise InvenioWebSubmitFileConverterError("Error in running %s\n stdout:\n%s\nstderr:\n%s\n" % (args, out, err))
