/*
 * LuceneSearcher.java
 *
 * $Author: tsakas $
 * $Date: 2007/12/20 14:37:39 $
 * $Id: LuceneSearcher.java,v 1.1 2007/12/20 14:37:39 tsakas Exp $
 *
 * <pre>
 *             Copyright (c) : 2006 Fast Search & Transfer ASA
 *                             ALL RIGHTS RESERVED
 * </pre>
 */

package org.gcube.indexmanagement.lucenewrapper;

import gr.uoa.di.madgik.grs.events.KeyValueEvent;
import gr.uoa.di.madgik.grs.proxy.tcp.TCPWriterProxy;
import gr.uoa.di.madgik.grs.record.GenericRecord;
import gr.uoa.di.madgik.grs.record.GenericRecordDefinition;
import gr.uoa.di.madgik.grs.record.Record;
import gr.uoa.di.madgik.grs.record.RecordDefinition;
import gr.uoa.di.madgik.grs.record.field.FieldDefinition;
import gr.uoa.di.madgik.grs.record.field.StringFieldDefinition;
import gr.uoa.di.madgik.grs.writer.RecordWriter;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.gcube.common.core.utils.logging.GCUBELog;
import org.gcube.indexmanagement.common.FullTextIndexType;
import org.gcube.indexmanagement.common.IndexContext;
import org.gcube.indexmanagement.common.IndexException;
import org.gcube.indexmanagement.common.IndexField;
import org.gcube.indexmanagement.common.IndexSearch;
import org.gcube.indexmanagement.common.IndexType;
import org.gcube.indexmanagement.common.PropertyElementLookup;
import org.gcube.indexmanagement.common.linguistics.lemmatizerplugin.Language;
import org.gcube.indexmanagement.common.linguistics.lemmatizerplugin.LemmatizerFactory;
import org.gcube.indexmanagement.common.linguistics.lemmatizerplugin.LemmatizerPlugin;
import org.gcube.indexmanagement.resourceregistry.RRadaptor;

/**
 * An IndexSearch implementation used to search a Lucene index.
 */
public class LuceneSearcher implements IndexSearch {
	
	/**
	 * Relation names declared alongside {@link #EQUALS}. Not referenced
	 * anywhere else in this class.
	 * NOTE(review): presumably consumed by the CQL query processing code --
	 * confirm against the callers.
	 */
	public enum SupportedRelations { adj, fuzzy, proximity, within };
	
	/** The '=' relation, which is also a supported relation. */
	public static final String EQUALS = "=";
	
	/**
	 * Not referenced in this class.
	 * NOTE(review): looks like the name used to address every index at once
	 * -- confirm against the callers.
	 */
	public static final String ALL_INDEXES = "allIndexes";

	/** Logger shared by all instances of this class. */
	static GCUBELog logger = new GCUBELog(LuceneSearcher.class);

	// Disabled declaration of the directory where the local index is stored:
	// private final String localIndexDir =
	// IndexServiceConst.LOCAL_INDEX_DIRECTORY + "lucene/";
	/** The Lucene searcher used for querying the index; (re)created by openIndex and updateIndex. */
	private Searcher searcher;

	/** The Lucene index reader handed to the search worker in order to find query statistics. */
	private IndexReader reader;

	/**
	 * The IndexType of the index to be queried. Stays null until setIndexType
	 * is called; executeQuery treats null as "the local index is empty".
	 */
	private FullTextIndexType idxType = null;

	/** Path of the index (indexDir + indexName), recorded by openIndex and reused by updateIndex. */
	private String indexPath = null;

	/** Lemmatizer plugin; only loaded by the three-argument constructor, null otherwise. */
	private LemmatizerPlugin lepl = null;

	/** Language used for query expansion when the query carries no language prefix. */
	private Language defaultLanguage;

	/** Whether executeQuery expands the query before running it (true only when a lemmatizer was loaded). */
	private boolean doExpand;

	/** Boost value given to the fields that are not language specific. */
	private float lowBoostValue = 0.1f;

	/** Names of the fields of the current index type; rebuilt by setIndexType. */
	public ArrayList<String> fieldNames = new ArrayList<String>();
	/** Maps every name in fieldNames to lowBoostValue; rebuilt by setIndexType. */
	private HashMap<String, Float> defaultLowBoostFields = new HashMap<String, Float>();
	/** Boost map used when terms are only lemmatized; never populated in this class. */
	private HashMap<String, Float> noLowBoostFields = new HashMap<String, Float>();

	/** Analyzer shared by all parsers of this class; constructed with an empty stop word array. */
	private StandardAnalyzer analyzer = new StandardAnalyzer(new String[0]);
	/** Parser used to turn the final query string into a Lucene query; "_contents" is its default field. */
	private QueryParser defaultParser = new QueryParser("_contents", analyzer);

	/** Resource registry adaptor of the most recent executeCqlQuery call; null until then. */
	private RRadaptor adaptor = null;
	
	/** Optional snippet size forwarded to the search worker when not null. */
	private Integer snippetSize;
	/** Optional maximum number of snippets forwarded to the search worker when not null. */
	private Integer maximumSnippetsCount;
    
	
	/**
	 * Sets the snippet size that is forwarded to the search worker of every
	 * subsequent query.
	 *
	 * @param snippetSize the snippet size; when null nothing is forwarded and
	 *            the worker keeps its own setting
	 */
	public void setSnippetSize(Integer snippetSize) {
		this.snippetSize = snippetSize;
	}

	/**
	 * Sets the maximum number of snippets that is forwarded to the search
	 * worker of every subsequent query.
	 *
	 * @param maximumSnippetsCount the maximum snippet count; when null nothing
	 *            is forwarded and the worker keeps its own setting
	 */
	public void setMaximumSnippetsCount(Integer maximumSnippetsCount) {
		this.maximumSnippetsCount = maximumSnippetsCount;
	}
	
	

	/**
	 * Opens a Lucene IndexSearcher on the given index path and configures it
	 * to score with a GlobalSimilarity.
	 *
	 * @param path the path of the index to open
	 * @return the configured searcher
	 * @throws IOException if the searcher cannot be opened
	 */
	private IndexSearcher createLuceneIndexSearcher(String path) throws IOException {
		final GlobalSimilarity scoring = new GlobalSimilarity();
		final IndexSearcher opened = new IndexSearcher(path);
		opened.setSimilarity(scoring);
		return opened;
	}

	/**
	 * Creates a searcher that expands queries with the help of a lemmatizer
	 * plugin. The plugin is loaded by name and initialised for English,
	 * Italian, French, Spanish and German.
	 *
	 * @param lemmatizerPluginName name of the plugin handed to the LemmatizerFactory
	 * @param configFilePath configuration file handed to the plugin
	 * @param defaultLanguage language used for queries without a language prefix
	 * @throws Exception wrapping anything thrown while loading or initialising the plugin
	 */
	public LuceneSearcher(String lemmatizerPluginName, String configFilePath,
			Language defaultLanguage) throws Exception {
		logger.debug("Loading plugin using factory, config file: "
				+ configFilePath);
		try {
			this.defaultLanguage = defaultLanguage;
			this.doExpand = true;

			Vector<Language> languages = new Vector<Language>();
			lepl = LemmatizerFactory.loadPlugin(lemmatizerPluginName);

			// Languages the plugin is initialised for (sv and af are not enabled).
			Language[] enabled = { Language.en, Language.it, Language.fr,
					Language.es, Language.de };
			for (Language candidate : enabled) {
				languages.add(candidate);
			}
			lepl.init(configFilePath, languages);
		} catch (Throwable e) {
			logger.error("unable to add language", e);
			throw new Exception(e);
		}
		logger.debug("Lemmatizer initiated");
	}

	/**
	 * Creates a searcher without a lemmatizer plugin: query expansion is
	 * switched off, so executeQuery runs the query string as it is given.
	 *
	 * @throws Exception declared by the signature; nothing in the body throws
	 */
	public LuceneSearcher() throws Exception {
		this.doExpand = false;
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * Opens a searcher and a reader on the path <code>indexDir + indexName</code>
	 * (the two strings are concatenated as they are). The path is remembered
	 * so that {@link #updateIndex()} can reopen the same index. If the reader
	 * cannot be opened, the searcher that was just created is closed again so
	 * that it does not leak.
	 *
	 * @param indexDir the directory part of the index path
	 * @param indexName the name part of the index path
	 * @throws IndexException if the searcher or the reader cannot be opened
	 */
	public void openIndex(String indexDir, String indexName)
			throws IndexException {
		IndexSearcher newSearcher = null;
		try {
			this.indexPath = indexDir + indexName;
			newSearcher = createLuceneIndexSearcher(indexPath);
			IndexReader newReader = IndexReader.open(indexPath);
			// Publish both handles only once both have been opened.
			this.searcher = newSearcher;
			this.reader = newReader;
		} catch (Exception e) {
			logger.error("unable to open index", e);
			if (newSearcher != null) {
				// The reader failed to open: release the half-opened searcher.
				try {
					newSearcher.close();
				} catch (Exception closeError) {
					logger.error("could not close searcher after failing to open the Index: ", closeError);
				}
			}
			throw new IndexException(e);
		}
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * Closes the current searcher and reader (failures to close are logged
	 * and otherwise ignored) and reopens both on the path recorded by
	 * openIndex. If the reader cannot be reopened, the freshly created
	 * searcher is closed again so that it does not leak.
	 *
	 * @throws IndexException if the searcher or the reader cannot be reopened
	 */
	public void updateIndex() throws IndexException {
		// Nothing to close when the index has never been opened.
		if (searcher != null) {
			try {
				searcher.close();
			} catch (Exception e) {
				logger.error("could not close searcher while updating the Index: ", e);
			}
		}
		if (reader != null) {
			try {
				reader.close();
			} catch (Exception e) {
				logger.error("could not close reader while updating the Index: ", e);
			}
		}

		IndexSearcher reopenedSearcher = null;
		try {
			reopenedSearcher = createLuceneIndexSearcher(indexPath);
			IndexReader reopenedReader = IndexReader.open(indexPath);
			// Publish both handles only once both have been opened.
			searcher = reopenedSearcher;
			reader = reopenedReader;
		} catch (Exception e) {
			logger.error("unable to reopen index", e);
			if (reopenedSearcher != null) {
				// The reader failed to open: release the half-opened searcher.
				try {
					reopenedSearcher.close();
				} catch (Exception closeError) {
					logger.error("could not close searcher after failing to reopen the Index: ", closeError);
				}
			}
			throw new IndexException(e);
		}
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * Besides remembering the index type, this rebuilds the list of field
	 * names and the boost map that assigns the low boost value to every one
	 * of those fields.
	 *
	 * @param idxType the index type of the index to be queried
	 */
	public void setIndexType(FullTextIndexType idxType) throws IndexException {
		this.idxType = idxType;

		// Collect the names of all fields declared by the index type.
		fieldNames.clear();
		int position = 0;
		while (position < idxType.getNumberOfFields()) {
			fieldNames.add(idxType.getFields()[position].name);
			position++;
		}

		// Every known field starts out with the low boost value.
		defaultLowBoostFields.clear();
		for (int i = 0; i < fieldNames.size(); i++) {
			defaultLowBoostFields.put(fieldNames.get(i), lowBoostValue);
		}
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * This implementation is intentionally empty.
	 * NOTE(review): the searcher and reader opened by openIndex are not
	 * closed here; they are only closed when updateIndex replaces them.
	 * Confirm whether that is deliberate (e.g. because search workers may
	 * still be reading) or whether the handles should be released here.
	 */
	public void closeIndex() throws IndexException {
		// nothing to do
	}

	/**
	 * {@inheritDoc}
	 * <p>
	 * The CQL query is translated into a Lucene query by a
	 * LuceneGcqlProcessor; the resulting query and projected fields are then
	 * handed to the search worker. The adaptor is also remembered on this
	 * instance because the field definitions are built with it.
	 *
	 * @param presentable the presentable fields
	 * @param searchable the searchable fields
	 * @param queryString the CQL query
	 * @param isComplete whether the full payload is returned instead of projected fields
	 * @param adaptor the resource registry adaptor used to resolve fields
	 * @return the locator of the result set
	 * @throws Exception if the query cannot be processed or executed
	 */
	public URI executeCqlQuery(ArrayList<String> presentable, ArrayList<String> searchable, 
			String queryString, boolean isComplete, RRadaptor adaptor) throws Exception {
		this.adaptor = adaptor;

		// Translate the CQL query into its Lucene counterpart.
		LuceneGcqlProcessor cqlProcessor = new LuceneGcqlProcessor();
		Object processed = cqlProcessor.processQuery(presentable, searchable, queryString, adaptor);
		LuceneGcqlQueryContainer container = (LuceneGcqlQueryContainer) processed;

		return invokeWorker(presentable, container.getProjectedFields(),
				container.getLuceneQuery(), isComplete);
	}
	
	/**
	 * {@inheritDoc}
	 * <p>
	 * When no index type has been set, the local index is considered empty
	 * and the locator of an empty result set is returned. Otherwise, if this
	 * searcher was created with a lemmatizer, the query is expanded first. An
	 * optional leading <code>_querylang_XX:</code> or
	 * <code>_querylemlang_XX:</code> prefix selects the language of the
	 * expansion; the former also searches the language specific fields, the
	 * latter only lemmatizes. The resulting query is handed to the search
	 * worker.
	 *
	 * @param queryString the query to execute
	 * @param isComplete whether the full payload is returned for each hit
	 * @return the locator of the result set
	 * @throws IndexException if the query is malformed (including a language
	 *             prefix that is not terminated by ':') or cannot be executed
	 * @deprecated This method applies a query expansion, before executing the
	 *             query, that assumes an old Index version was used (Index
	 *             Library version &lt; 3.0.0).
	 */
	public URI executeQuery(String queryString, boolean isComplete) throws IndexException {

		if (idxType == null) {
			// If the idxType is null, the local index is empty.
			// Return an empty RS at once, in order to avoid nullpointer trouble
			try {
				RecordWriter<GenericRecord> rsWriter = new RecordWriter<GenericRecord>(
						new TCPWriterProxy(), new RecordDefinition[] {new GenericRecordDefinition(
								(new FieldDefinition[] { //A record can contain a number of different field definitions
								        new StringFieldDefinition()          //The definition of the field
								      }))});

				//send an event for the total number of results
				rsWriter.emit(new KeyValueEvent(IndexType.RESULTSNO_EVENT, "0"));
				rsWriter.close();
				return rsWriter.getLocator();

			} catch (Exception e) {
				logger.error("error returning empty ResultSet", e);
				throw new IndexException(e);
			}
		}

		if (doExpand) {
			// Expand the query to handle language and field prefixes.
			String defaultLang = defaultLanguage.toShortString();
			boolean onlyLemmatize = true;

			logger.debug("Expanding the query: " + queryString);

			boolean langPrefix = queryString.startsWith("_querylang_");
			if (langPrefix || queryString.startsWith("_querylemlang_")) {
				// Lengths of "_querylang_" and "_querylemlang_" respectively.
				int langStart = langPrefix ? 11 : 14;
				onlyLemmatize = !langPrefix;

				int prefixEnd = queryString.indexOf(':');
				if (prefixEnd < 0) {
					// Without this check substring() would fail with an
					// unchecked StringIndexOutOfBoundsException.
					IllegalArgumentException malformed = new IllegalArgumentException(
							"query language prefix is not terminated by ':': \""
									+ queryString + "\"");
					logger.error("unable to expand query", malformed);
					throw new IndexException(malformed);
				}
				defaultLang = queryString.substring(langStart, prefixEnd);
				queryString = queryString.substring(prefixEnd + 1);
			}

			queryString = expandQuery(queryString, defaultLang,
					defaultLang, onlyLemmatize);
			logger.debug("Expanded query: " + queryString);
		}

		QuerySnippetTermsPair query = new QuerySnippetTermsPair();
		query.query = queryString;
		return invokeWorker(new ArrayList<String>(), new LinkedHashMap<String, String>(), query, isComplete);
	}
	
	/**
	 * Parses and runs a Lucene query and starts a LuceneSearchWorker that
	 * streams the hits into a new result set.
	 * <p>
	 * The total number of hits is emitted as an event on the result set
	 * before the worker is started. If anything fails after the result set
	 * writer was created but before the worker was started, the writer is
	 * closed so that it does not leak; once the worker runs, the writer
	 * belongs to the worker and is left alone.
	 *
	 * @param presentable the presentable fields, used for the record definition and by the worker
	 * @param projections the projected fields, used for the record definition and by the worker
	 * @param querySnippetTermsPair holder of the Lucene query string to execute
	 * @param isComplete whether the full payload is returned for each hit
	 * @return the locator of the result set the worker writes to
	 * @throws IndexException if the record definition cannot be built, the
	 *             query cannot be parsed, or the search cannot be started
	 */
	private URI invokeWorker(ArrayList<String> presentable, LinkedHashMap<String, String> projections, QuerySnippetTermsPair querySnippetTermsPair, boolean isComplete) throws IndexException{
		//give the definition for the record
		FieldDefinition[] fieldDef;
		try {
			fieldDef = createFieldDefinition(presentable, projections, isComplete);
		} catch (Exception e) {
			logger.error("Could not create field definition: ", e);
			throw new IndexException(e);
		}

		RecordDefinition[] definition = new RecordDefinition[]{new GenericRecordDefinition(fieldDef)};

		Query query;
		try {
			query = defaultParser.parse(querySnippetTermsPair.query);
		} catch (ParseException e) {
			throw new IndexException(e);
		}

		// Retrieve query terms
		QueryTerm[] terms = QueryTermFilter.getTerms(query);

		RecordWriter<GenericRecord> rsWriter = null;
		boolean workerStarted = false;

		// Execute query
		try {
			Hits queryHits = searcher.search(query);
			int numberOfHits = queryHits.length();
			logger.debug("the query \"" + querySnippetTermsPair.query + "\" gave "
					+ numberOfHits + " hits.");

			//since the new RS supports transparently flow control, we won't
			//put a limitation on the max number of results
			int maxHits = numberOfHits;

			long startWriter = System.currentTimeMillis();
			rsWriter = new RecordWriter<GenericRecord>(new TCPWriterProxy(), definition);
			//send an event for the total number of results
			rsWriter.emit(new KeyValueEvent(IndexType.RESULTSNOFINAL_EVENT, "" + maxHits));
			long stopWriter = System.currentTimeMillis();

			long startWorker = System.currentTimeMillis();
			LuceneSearchWorker worker = new LuceneSearchWorker(rsWriter,
					reader, terms, queryHits, maxHits,
					idxType, isComplete, presentable, projections, 
					querySnippetTermsPair, adaptor);

			if (snippetSize != null) {
				worker.setSnippetSize(snippetSize);
			}
			if (maximumSnippetsCount != null) {
				worker.setMaximumSnippetsCount(maximumSnippetsCount);
			}

			worker.start();
			workerStarted = true;
			long stopWorker = System.currentTimeMillis();

			long startLocator = System.currentTimeMillis();
			URI uri = rsWriter.getLocator();
			long stopLocator = System.currentTimeMillis();

			logger.debug("Factory writer : "+(stopWriter-startWriter));
			logger.debug("Populating thread init : "+(stopWorker-startWorker));
			logger.debug("Locator retrieval : "+(stopLocator-startLocator));
			return uri;
		} catch (Exception e) {
			logger.error("error executing lucene query: \"" + querySnippetTermsPair
					+ "\"", e);
			if (rsWriter != null && !workerStarted) {
				// Nobody will ever write to or close this writer: release it.
				try {
					rsWriter.close();
				} catch (Exception closeError) {
					logger.error("could not close the result set writer of a failed query: ", closeError);
				}
			}
			throw new IndexException(e);
		}
	}
	
	/**
	 * Builds the field definitions of the records written to the result set.
	 * <p>
	 * Score, statistics and document id fields always come first. For a
	 * complete result the payload field follows. Otherwise the projected
	 * fields follow: with a wildcard projection these are the ids of all
	 * presentable fields except the document id and payload fields, without
	 * one they are the keys of the projection map in insertion order. These
	 * cases correspond to the way the worker fills the result set.
	 *
	 * @param presentable the presentable field names (used for the wildcard projection)
	 * @param projections the projected fields; may be null or empty
	 * @param isComplete whether the full payload is returned instead of projections
	 * @return the field definitions, in the order described above
	 * @throws Exception if a field id cannot be resolved through the adaptor
	 */
	private FieldDefinition[] createFieldDefinition(ArrayList<String> presentable, LinkedHashMap<String, String> projections, boolean isComplete) throws Exception{
		ArrayList<FieldDefinition> definitions = new ArrayList<FieldDefinition>();

		// the three fixed fields: score, statistics and docID
		definitions.add(new StringFieldDefinition(IndexType.SCORE_FIELD));
		definitions.add(new StringFieldDefinition(IndexType.STATS_FIELD));
		definitions.add(new StringFieldDefinition(IndexType.DOCID_FIELD));

		if (isComplete) {
			definitions.add(new StringFieldDefinition(IndexType.PAYLOAD_FIELD));
		} else if (projections != null && projections.size() != 0) {
			if (projections.containsValue(IndexType.WILDCARD)) {
				// Wildcard: return all the presentable fields (it is the
				// updater's responsibility to make sure they are returnable,
				// i.e. stored) except for the ObjectID and the full payload.
				for (String presentableName : presentable) {
					String presentableID = adaptor.getFieldIDFromName(presentableName);

					boolean isDocID = presentableName.equalsIgnoreCase(IndexType.DOCID_FIELD);
					boolean isPayload = presentableName.equalsIgnoreCase(IndexType.PAYLOAD_FIELD);
					if (!(isDocID || isPayload)) {
						definitions.add(new StringFieldDefinition(presentableID));
					}
				}
			} else {
				// Explicit projections: one field per projection key.
				for (String projected : projections.keySet()) {
					definitions.add(new StringFieldDefinition(projected));
				}
			}
		}
		// no projection defined and not complete: only the three fixed fields

		return definitions.toArray(new FieldDefinition[definitions.size()]);
	}

	/**
	 * Expands a query string term by term and returns the expanded query
	 * wrapped in parentheses.
	 * <p>
	 * The query is scanned from left to right. Three kinds of terms are
	 * recognised:
	 * <ul>
	 * <li><code>_lang_XX:term</code> / <code>_lemlang_XX:term</code> - a term
	 * with an explicit language, expanded with expandLangTerm;</li>
	 * <li><code>_lang_XX_field:term</code> /
	 * <code>_lemlang_XX_field:term</code> - a term with an explicit language
	 * and field, expanded with expandFieldedLangTerm;</li>
	 * <li>any other term - expanded with expandLangTerm using the default
	 * languages, except for the operators AND and OR which are copied as they
	 * are.</li>
	 * </ul>
	 * A term that starts with '(' is a group: its content is expanded by a
	 * recursive call before the group itself is expanded. The end of each term
	 * is found with getTermStop.
	 * <p>
	 * NOTE(review): the main loop and the space-skipping loops run while
	 * <code>idx &lt; querySize - 1</code>, so the <code>idx == querySize</code>
	 * checks that follow the space-skipping loops look unreachable, and a
	 * one-character term at the very end of the query (or a one-character
	 * query) appears to be dropped - confirm whether that is intended.
	 *
	 * @param query the query (or group content) to expand
	 * @param defaultLemLang language used to lemmatize terms without a language prefix
	 * @param defaultLang language used for the language specific fields of terms without a language prefix
	 * @param onlyLemByDefault whether terms without a <code>_lang_</code> prefix are only lemmatized
	 * @return the expanded query, enclosed in parentheses
	 * @throws IndexException if a term cannot be expanded
	 */
	public String expandQuery(final String query, String defaultLemLang,
			String defaultLang, boolean onlyLemByDefault) throws IndexException {
		int idx = 0;
		int querySize = query.length();
		StringBuffer newQueryBuf = new StringBuffer();
		newQueryBuf.append('(');

		while (idx < (querySize - 1)) {
			String lemLang, lang, field, tempTerm, term;
			int termStopIdx;

			// ignore spaces
			while (idx < (querySize - 1) && query.charAt(idx) == ' ') {
				idx++;
			}
			if (idx == querySize) {
				break;
			}

			// Handle language specified terms
			if ((tempTerm = query.substring(idx)).startsWith("_lang_")
					|| tempTerm.startsWith("_lemlang_")) {
				boolean onlyLemmatize;

				// skip the prefix: "_lemlang_" is 9 characters, "_lang_" is 6
				if (tempTerm.startsWith("_lemlang_")) {
					onlyLemmatize = onlyLemByDefault;
					lang = defaultLang;
					idx += 9;
				} else {
					onlyLemmatize = false;
					lang = null;// will be initialized after lemlang.
					idx += 6;
				}

				// read the language code, which ends at '_' (a field follows)
				// or at ':' (the term follows)
				StringBuffer langBuf = new StringBuffer();
				while (idx < querySize && query.charAt(idx) != '_'
						&& query.charAt(idx) != ':') {
					langBuf.append(query.charAt(idx));
					idx++;
				}
				// the query ended inside the prefix: stop expanding
				if (idx == querySize) {
					break;
				}

				lemLang = langBuf.toString();
				if (tempTerm.startsWith("_lang_")) {
					lang = lemLang;
				}

				// Handle language specified terms which also have specified
				// fields
				if (query.charAt(idx) == '_') {
					idx++;
					// read the field name up to the ':' separator
					StringBuffer fieldBuf = new StringBuffer();
					while (idx < querySize && query.charAt(idx) != ':') {
						fieldBuf.append(query.charAt(idx));
						idx++;
					}
					// NOTE(review): idx points at ':' here (or equals
					// querySize), so this space-skipping loop looks like a
					// no-op - confirm.
					while (idx < (querySize - 1) && query.charAt(idx) == ' ') {
						idx++;
					}
					if (idx == querySize) {
						break;
					}

					field = fieldBuf.toString();

					// step over the ':' separator
					// NOTE(review): if ':' is the last character, idx now
					// equals querySize and getTermStop reads past the end of
					// the query - confirm this cannot happen.
					idx++;
					termStopIdx = getTermStop(query, idx);
					if (query.charAt(idx) == '(') {
						// expand words inside group
						// (the substring drops the enclosing parentheses)
						term = expandQuery(query.substring(idx + 1,
								termStopIdx - 1), lemLang, lang, onlyLemmatize);
					} else {
						term = query.substring(idx, termStopIdx);
					}
					idx = termStopIdx;
					newQueryBuf.append(" ").append(
							expandFieldedLangTerm(term, lemLang, lang, field,
									onlyLemmatize));
				}
				// Handle language specified terms which don't have specified
				// fields
				else {
					// step over the ':' separator, then over any spaces
					idx++;
					while (idx < (querySize - 1) && query.charAt(idx) == ' ') {
						idx++;
					}
					if (idx == querySize) {
						break;
					}
					termStopIdx = getTermStop(query, idx);
					if (query.charAt(idx) == '(') {
						// expand words inside group
						// (the substring drops the enclosing parentheses)
						term = expandQuery(query.substring(idx + 1,
								termStopIdx - 1), lemLang, lang, onlyLemmatize);
					} else {
						term = query.substring(idx, termStopIdx);
					}
					idx = termStopIdx;
					newQueryBuf.append(" ").append(
							expandLangTerm(term, lemLang, lang, onlyLemmatize));
				}
			} else {
				// Handle terms which don't have specified language
				termStopIdx = getTermStop(query, idx);
				if (query.charAt(idx) == '(') {
					// expand words inside group
					// (the substring drops the enclosing parentheses)
					term = expandQuery(query
							.substring(idx + 1, termStopIdx - 1),
							defaultLemLang, defaultLang, onlyLemByDefault);
				} else {
					term = query.substring(idx, termStopIdx);
				}
				idx = termStopIdx;

				// the boolean operators are copied to the output unchanged
				if (!term.trim().equalsIgnoreCase("AND")
						&& !term.trim().equalsIgnoreCase("OR")) {
					newQueryBuf.append(" ").append(
							expandLangTerm(term, defaultLemLang, defaultLang,
									onlyLemByDefault));
				} else {
					newQueryBuf.append(" ").append(term);
				}
			}
		}
		newQueryBuf.append(')');
		return newQueryBuf.toString();
	}

	/**
	 * Expands a term that was given with both a language and a field.
	 * <p>
	 * The term is lemmatized first. When only lemmatizing, the result is a
	 * single clause on the plain field. Otherwise a clause on the language
	 * specific field (<code>_lang_&lt;lang&gt;_&lt;field&gt;</code>) is put in
	 * front of it and the plain field clause receives the low boost value.
	 *
	 * @param term the term to expand
	 * @param lemLang the language used to lemmatize the term
	 * @param lang the language of the language specific field
	 * @param field the field the term is restricted to
	 * @param onlyLemmatize whether the language specific field is left out
	 * @return the expanded term, enclosed in parentheses
	 * @throws IndexException if the term cannot be lemmatized
	 */
	private String expandFieldedLangTerm(String term, String lemLang,
			String lang, String field, boolean onlyLemmatize)
			throws IndexException {
		final String lemmatized = lemExpand(term, lemLang);
		final String plainClause = field + ":(" + lemmatized + ")";

		if (onlyLemmatize) {
			return "( " + plainClause + ")";
		}

		final String languageClause = "_lang_" + lang + "_" + field + ":("
				+ lemmatized + ")";
		return "( " + languageClause + plainClause + "^" + lowBoostValue + ")";
	}

	/**
	 * Expands a term that was given without a field over all fields of the
	 * index type.
	 * <p>
	 * The term is lemmatized first and then parsed by a multi field parser.
	 * When only lemmatizing, the parser covers the plain fields without any
	 * boost. Otherwise it covers the language specific variant of every field
	 * (<code>_lang_&lt;lang&gt;_&lt;field&gt;</code>) followed by the plain
	 * fields, which receive the low boost.
	 *
	 * @param term the term to expand
	 * @param lemLang the language used to lemmatize the term
	 * @param lang the language of the language specific fields
	 * @param onlyLemmatize whether the language specific fields are left out
	 * @return the string form of the parsed query, enclosed in parentheses
	 * @throws IndexException if the term cannot be lemmatized or parsed
	 */
	private String expandLangTerm(String term, String lemLang, String lang,
			boolean onlyLemmatize) throws IndexException {
		try {
			// Reassigning the parameter keeps the error message below in line
			// with the text that was actually parsed.
			term = lemExpand(term, lemLang);

			final String[] searchFields;
			final HashMap<String, Float> boosts;

			if (!onlyLemmatize) {
				boosts = defaultLowBoostFields;

				final String prefix = "_lang_" + lang + "_";
				ArrayList<String> combined = new ArrayList<String>();
				for (String fieldName : fieldNames) {
					combined.add(prefix + fieldName);
				}
				combined.addAll(fieldNames); // will receive low boost
				searchFields = combined.toArray(new String[combined.size()]);
			} else {
				boosts = noLowBoostFields;
				searchFields = fieldNames.toArray(new String[fieldNames.size()]);
			}

			MultiFieldQueryParser multiParser = new MultiFieldQueryParser(
					searchFields, analyzer, boosts);

			return "(" + multiParser.parse(term).toString() + ")";
		} catch (Exception e) {
			logger.error("error expanding language specified term: \"" + term
					+ "\"", e);
			throw new IndexException(e);
		}
	}

	/**
	 * Expands a single word term with the alternatives proposed by the
	 * lemmatizer plugin.
	 * <p>
	 * The term is trimmed first. Empty terms, phrases, groups and ranges
	 * (terms starting with a double quote, '(', '[' or '{') are returned
	 * without being lemmatized. The lemmatizer result is split on '#'; if
	 * every part is a usable word the result is
	 * <code>(term OR word1 OR word2 ...)</code>. As soon as a part is blank
	 * or equals "No", or when the lemmatizer returns nothing at all, the
	 * trimmed term is returned on its own.
	 *
	 * @param term the term to expand
	 * @param lang the name of the language to lemmatize in
	 * @return the trimmed term, possibly extended with its alternatives
	 * @throws IndexException if <code>lang</code> is not a known language
	 */
	private String lemExpand(String term, String lang) throws IndexException {
		term = term.trim();
		if (term.length() == 0) {
			// Nothing to lemmatize; charAt(0) below would fail on an empty term.
			return term;
		}

		char first = term.charAt(0);
		if (first == '\"' || first == '(' || first == '[' || first == '{') {
			return term; // only lemmatize single word terms; not phrases,
			// groups or ranges.
		}

		Language language;
		try {
			language = Language.valueOf(lang);
		} catch (IllegalArgumentException e) {
			// Report an unknown language through the declared exception type.
			logger.error("unsupported lemmatizer language: \"" + lang + "\"", e);
			throw new IndexException(e);
		}

		String addedWords = lepl.lemmatize_word(term, language);
		logger.debug("found addedWords: " + addedWords + ", for term: " + term);
		if (addedWords == null) {
			// No answer from the lemmatizer: keep the term as it is.
			return term;
		}

		StringBuilder expandedWord = new StringBuilder("(").append(term);
		for (String replacement : addedWords.split("#")) {
			String candidate = replacement.trim();
			if (candidate.length() == 0 || candidate.equals("No")) {
				return term;
			}
			expandedWord.append(" OR ").append(replacement);
		}
		return expandedWord.append(")").toString();
	}

	/**
	 * Finds the end of the term that starts at <code>termStart</code>.
	 * <p>
	 * The first character decides how the term is delimited: a double quote
	 * ends at the next double quote, '[' at ']', '{' at '}' and '(' at the
	 * ')' that balances it (nested groups are counted). Any other term ends
	 * at the next space. For the delimited forms, a delimiter preceded by an
	 * odd number of backslashes is escaped and ignored.
	 *
	 * @param query the query being scanned
	 * @param termStart index of the first character of the term
	 * @return the index just after the character that ends the term, or the
	 *         length of the query when the term runs to the end of the query
	 */
	private int getTermStop(String query, int termStart) {
		final char escape = '\\';
		final int length = query.length();

		char opener = query.charAt(termStart);
		final char closer;
		int depth = 0;
		switch (opener) {
		case '\"':
			closer = '\"';
			break;
		case '(':
			closer = ')';
			depth = 1;
			break;
		case '[':
			closer = ']';
			break;
		case '{':
			closer = '}';
			break;
		default:
			// plain term: it ends at the next space
			opener = ' ';
			closer = ' ';
			break;
		}
		final boolean delimited = opener != ' ';
		final boolean grouped = opener == '(';

		for (int pos = termStart + 1; pos < length; pos++) {
			final char current = query.charAt(pos);
			final boolean closes = current == closer;
			if (!closes && current != opener) {
				continue; // not closing, not opening, not interested...
			}

			if (delimited) {
				// count the backslashes right in front of the delimiter
				int backslashes = 0;
				while (query.charAt(pos - backslashes - 1) == escape) {
					backslashes++;
				}
				if (backslashes % 2 != 0) {
					continue; // uneven # of escChars => delimiter was escaped
				}
			}

			if (grouped) {
				depth += closes ? -1 : 1;
			}

			if (closes && depth == 0) {
				return pos + 1; // the end of the term is reached
			}
		}
		return length;
	}

}
