/**
 * TextcatLangid
 *
 * $Author: tsakas $
 * $Date: 2007/12/20 14:37:40 $
 * $Id: JTextCatPlugin.java,v 1.1 2007/12/20 14:37:40 tsakas Exp $
 *
 * <pre>
 *             Copyright (c) : 2006 Fast Search & Transfer ASA
 *                             ALL RIGHTS RESERVED
 * </pre>
 */

package org.gcube.indexmanagement.common.linguistics.jtextcat;

import org.knallgrau.utils.textcat.TextCategorizer;
import java.lang.String;

import org.gcube.indexmanagement.common.IndexException;
import org.gcube.indexmanagement.common.linguistics.languageidplugin.LanguageIdPlugin;
import org.gcube.indexmanagement.common.linguistics.lemmatizerplugin.Language;

import java.util.Vector;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * Language identification plugin that delegates to a JTextCat
 * {@link TextCategorizer} and translates the category it reports into the
 * short language string defined by {@link Language}.
 *
 * <p>{@link #init(String)} must be called before {@link #detectLanguage(String)};
 * until then every detection request yields {@code "nolang"}.
 */
public class JTextCatPlugin implements LanguageIdPlugin {

    /** Value returned whenever no language could be determined. */
    private static final String NO_LANGUAGE = "nolang";

    /** Categorizer result that this class maps to {@link #NO_LANGUAGE}. */
    private static final String UNKNOWN_CATEGORY = "unknown";

    /** The text categorizer; {@code null} until {@link #init(String)} has run. */
    private TextCategorizer guesser;

    /**
     * All {@link Language} values, used to translate a language long name
     * into its short name. {@code null} until {@link #init(String)} has run.
     */
    private Vector<Language> languageVector;

    /**
     * Empty constructor. The plugin is not usable until {@link #init(String)}
     * has been called.
     */
    public JTextCatPlugin() {
    }

    /**
     * Initialises the plugin: creates a default-constructed
     * {@link TextCategorizer} and loads every {@link Language} value into the
     * translation list.
     *
     * @param configFile      The config file name. This implementation does not
     *                        read it; the categorizer is created with its
     *                        no-argument constructor.
     * @throws IndexException declared by the plugin contract; nothing in this
     *                        method throws it explicitly.
     */
    public void init(String configFile) throws IndexException {
        guesser = new TextCategorizer();
        languageVector = new Vector<Language>();
        for (Language ll : Language.values()) {
            languageVector.add(ll);
        }
    }

    /**
     * Detects the language of the document.
     *
     * <p>The category reported by the categorizer is compared, ignoring case,
     * with the long name of every known {@link Language}; on the first match
     * the corresponding short name is returned. A category that matches no
     * language and is not {@code "unknown"} is returned unchanged.
     *
     * @param  document     The document text.
     * @return              The short string of the detected language, which
     *                      can be converted by the language class to the
     *                      "ISO enum". The string "nolang" is returned if the
     *                      document is {@code null}, the plugin has not been
     *                      initialised, the categorizer fails or reports
     *                      "unknown". This implementation does not itself
     *                      produce "not_enough_data".
     * @throws IndexException declared by the plugin contract; failures inside
     *                      this method are reported as "nolang" instead.
     */
    public String detectLanguage(String document) throws IndexException {
        // Explicit guards instead of relying on a swallowed NullPointerException.
        if (document == null || guesser == null || languageVector == null) {
            return NO_LANGUAGE;
        }

        String res;
        try {
            // The result is treated as a language long name (it is matched
            // against Language.toLongString() below).
            res = guesser.categorize(document);
        }
        catch (Exception ignored) {
            // Detection is best effort: any categorizer failure is reported
            // to the caller as "no language identified".
            return NO_LANGUAGE;
        }

        if (res == null || UNKNOWN_CATEGORY.equals(res)) {
            return NO_LANGUAGE;
        }

        for (Language l : languageVector) {
            if (res.equalsIgnoreCase(l.toLongString())) {
                // Stop at the first match so the translated short name is
                // never compared against the remaining long names.
                return l.toShortString();
            }
        }
        return res;
    }
}
