package de.dfki.lt.tools.tokenizer;

import de.dfki.lt.tools.tokenizer.annotate.AnnotatedString;
import de.dfki.lt.tools.tokenizer.annotate.FastAnnotatedString;
import de.dfki.lt.tools.tokenizer.exceptions.LanguageNotSupportedException;
import de.dfki.lt.tools.tokenizer.exceptions.ProcessingException;
import de.dfki.lt.tools.tokenizer.output.Paragraph;
import de.dfki.lt.tools.tokenizer.output.ParagraphOutputter;
import de.dfki.lt.tools.tokenizer.regexp.Match;
import de.dfki.lt.tools.tokenizer.regexp.RegExp;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import org.hsqldb.Tokens;
import org.hsqldb.server.ServerConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/dfki/lt/tools/tokenizer/JTok.class */
public class JTok {
    /** Logger for this class. */
    private static final Logger LOG = LoggerFactory.getLogger(JTok.class);
    /** Configuration key whose value lists the language keys to load. */
    private static final String LANGUAGES_PROP = "languages";
    /** Annotation key under which the token class of a text run is stored. */
    public static final String CLASS_ANNO = "class";
    /** Annotation key under which border markers are stored. */
    public static final String BORDER_ANNO = "border";
    /** Border annotation value written by the text-unit detection. */
    public static final String TU_BORDER = "tu";
    /** Border annotation value written when a paragraph change is detected. */
    public static final String P_BORDER = "p";
    /** Maps a language key to the LanguageResource loaded for it. */
    private Map langResources;

    public JTok() throws IOException {
        Properties properties = new Properties();
        properties.load(FileTools.openResourceFileAsStream("jtok/jtok.cfg"));
        init(properties);
    }

    /**
     * Creates a tokenizer from an explicit configuration.
     *
     * @param properties configuration holding the {@code languages} entry and
     *     one entry per listed language key
     */
    public JTok(Properties properties) {
        this.init(properties);
    }

    /**
     * Loads one {@link LanguageResource} per language key listed in the
     * comma-separated {@code languages} property and registers it under that key.
     *
     * @param properties configuration; must contain the {@code languages} entry
     * @throws IllegalArgumentException if the {@code languages} entry is missing
     */
    private void init(Properties properties) {
        String languages = properties.getProperty(LANGUAGES_PROP);
        if (languages == null) {
            // Fail with a clear message instead of an NPE from inside StringTokenizer.
            throw new IllegalArgumentException("missing required property '" + LANGUAGES_PROP + "'");
        }
        setLangResources(new HashMap());
        StringTokenizer languageKeys = new StringTokenizer(languages, ",");
        while (languageKeys.hasMoreTokens()) {
            // Tolerate whitespace around the separators, e.g. "de, en, it".
            String language = languageKeys.nextToken().trim();
            if (language.length() == 0) {
                continue;
            }
            String resourceLocation = properties.getProperty(language);
            LOG.info("loading language resources for {} from {}", language, resourceLocation);
            getLangResources().put(language, new LanguageResource(language, resourceLocation));
        }
    }

    /**
     * @return the map from language key to its loaded language resource
     */
    private Map getLangResources() {
        return langResources;
    }

    /**
     * Replaces the map of loaded language resources.
     *
     * @param hashMap the new map from language key to language resource
     */
    private void setLangResources(HashMap hashMap) {
        langResources = hashMap;
    }

    /**
     * Returns the language resource registered under the given language key.
     *
     * @param str the language key
     * @return the resource registered for that key
     * @throws LanguageNotSupportedException if nothing is registered under the key
     */
    public LanguageResource getLanguageResource(String str) throws LanguageNotSupportedException {
        Object registered = getLangResources().get(str);
        if (registered == null) {
            throw new LanguageNotSupportedException("language " + str + " not supported");
        }
        return (LanguageResource) registered;
    }

    /**
     * Tokenizes a text by running the annotation passes of this class over it in
     * a fixed order: tokens, punctuation, clitics, numbers, abbreviations and
     * finally text-unit borders.
     *
     * @param str the text to tokenize
     * @param str2 the language key selecting the language resource to use
     * @return the text wrapped in an annotated string carrying the annotations
     *     written by the passes
     */
    public AnnotatedString tokenize(String str, String str2) {
        LanguageResource resource = getLanguageResource(str2);
        FastAnnotatedString annotated = new FastAnnotatedString(str);

        // The order matters: each pass reads the class annotations left by earlier ones.
        identifyTokens(annotated, resource);
        identifyPunct(annotated, resource);
        identifyClitics(annotated, resource);
        identifyNumbers(annotated, resource);
        identifyAbbrev(annotated, resource);
        identifyTus(annotated, resource);

        return annotated;
    }

    /**
     * Annotates every maximal run of non-separator characters with the tag name
     * of the class root. Separators are whitespace characters and the
     * no-break space (code point 160).
     *
     * @param annotatedString the text to annotate; iterated from its first character
     * @param languageResource supplies the class root whose tag name is used
     */
    private void identifyTokens(AnnotatedString annotatedString, LanguageResource languageResource) {
        // Sentinel returned by first()/next() once the iteration is exhausted (65535).
        final char done = Character.MAX_VALUE;
        final char noBreakSpace = (char) 160;

        String rootClass = languageResource.getClassesRoot().getTagName();
        int tokenStart = 0;
        boolean insideToken = false;

        for (char current = annotatedString.first(); current != done; current = annotatedString.next()) {
            boolean separator = Character.isWhitespace(current) || current == noBreakSpace;
            if (separator) {
                if (insideToken) {
                    // The token ended just before this separator.
                    annotatedString.annotate(CLASS_ANNO, rootClass, tokenStart, annotatedString.getIndex());
                    insideToken = false;
                }
            } else if (!insideToken) {
                insideToken = true;
                tokenStart = annotatedString.getIndex();
            }
        }

        // Close a token that runs up to the end of the text.
        if (insideToken) {
            annotatedString.annotate(CLASS_ANNO, rootClass, tokenStart, annotatedString.getIndex());
        }
    }

    /**
     * Splits class-annotated runs at punctuation: every run that carries a class
     * annotation is searched with the all-punctuation matcher, and the matches
     * are annotated with their punctuation class while the remaining pieces keep
     * the run's original class.
     *
     * @param annotatedString the text whose class annotations are refined in place
     * @param languageResource supplies the punctuation matchers and descriptions
     */
    private void identifyPunct(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp allPunctMatcher = languageResource.getAllPunctMatcher();
        RegExp internalMatcher = languageResource.getInternalMatcher();
        RegExp nbrMatcher = languageResource.getNbrMatcher();
        RegExp nblMatcher = languageResource.getNblMatcher();
        // Start at the first position that carries a class annotation.
        char index = annotatedString.setIndex(0);
        if (null == annotatedString.getAnnotation(CLASS_ANNO)) {
            index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
        }
        // 65535 is the value returned by setIndex once the iteration is exhausted.
        while (index != 65535) {
            int runLimit = annotatedString.getRunLimit(CLASS_ANNO);
            if (null == annotatedString.getAnnotation(CLASS_ANNO)) {
                index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            } else {
                // str is the class the run currently has; index2 is the run's start offset.
                String str = (String) annotatedString.getAnnotation(CLASS_ANNO);
                int index2 = annotatedString.getIndex();
                // Advance to the next run before this one is re-annotated.
                index = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
                String substring = annotatedString.substring(index2, runLimit);
                List allMatches = allPunctMatcher.getAllMatches(substring);
                if (0 != allMatches.size()) {
                    // i is the offset within substring up to which annotations have been written.
                    int i = 0;
                    for (int i2 = 0; i2 < allMatches.size(); i2++) {
                        Match match = (Match) allMatches.get(i2);
                        if (i == match.getStartIndex()) {
                            // The match starts exactly where the previous piece ended.
                            // NOTE(review): this condition has an empty body, so its result is
                            // unused; it looks like residue of lost logic - confirm against the
                            // original source.
                            if (nblMatcher.matches(match.toString()) && isRightContextEnd(match, allMatches, substring, i2)) {
                            }
                            annotatedString.annotate(CLASS_ANNO, identifyPunctClass(match, null, substring, languageResource), index2 + i, index2 + match.getEndIndex());
                            i = match.getEndIndex();
                        } else if (!internalMatcher.matches(match.toString()) || !isRightContextEnd(match, allMatches, substring, i2)) {
                            // Matches accepted by the internal matcher for which isRightContextEnd
                            // is true are skipped and stay part of the surrounding piece.
                            if (nbrMatcher.matches(match.toString())) {
                                // The punctuation is kept attached to the piece that precedes it.
                                annotatedString.annotate(CLASS_ANNO, str, index2 + i, index2 + match.getEndIndex());
                                i = match.getEndIndex();
                            } else {
                                // The piece before the match keeps the original class; the match
                                // itself gets its punctuation class.
                                annotatedString.annotate(CLASS_ANNO, str, index2 + i, index2 + match.getStartIndex());
                                i = match.getStartIndex();
                                annotatedString.annotate(CLASS_ANNO, identifyPunctClass(match, null, substring, languageResource), index2 + i, index2 + match.getEndIndex());
                                i = match.getEndIndex();
                            }
                        }
                    }
                    // Whatever follows the last handled match keeps the original class.
                    if (i != substring.length()) {
                        annotatedString.annotate(CLASS_ANNO, str, index2 + i, index2 + substring.length());
                    }
                }
            }
        }
    }

    /**
     * Tells whether something other than the next match (or the end of the
     * string, for the last match) directly follows the given match.
     *
     * @param match the match to inspect
     * @param list all matches found in {@code str}, in order
     * @param str the string the matches were found in
     * @param i position of {@code match} within {@code list}
     * @return {@code true} if the next match does not start at this match's end,
     *     or, for the last match, if the match does not end at the end of {@code str}
     */
    private boolean isRightContextEnd(Match match, List list, String str, int i) {
        int matchEnd = match.getEndIndex();
        boolean hasFollowingMatch = i < list.size() - 1;
        if (hasFollowingMatch) {
            Match following = (Match) list.get(i + 1);
            return following.getStartIndex() != matchEnd;
        }
        return matchEnd != str.length();
    }

    /**
     * Determines the punctuation class of a match. A match classified as
     * {@code OPENCLOSE_PUNCT} is disambiguated by its neighbours: it becomes
     * closing punctuation unless a letter follows it, and opening punctuation
     * if a letter follows but no letter precedes it. With letters on both sides
     * the class stays {@code OPENCLOSE_PUNCT}.
     *
     * @param match the punctuation match
     * @param regExp the matcher that produced the match, or {@code null}
     * @param str the string the match indices refer to
     * @param languageResource supplies the punctuation description
     * @return the punctuation class for the match
     */
    private String identifyPunctClass(Match match, RegExp regExp, String str, LanguageResource languageResource) {
        String punctClass = identifyClass(match.toString(), regExp, languageResource.getPunctDescr());
        if (!punctClass.equals("OPENCLOSE_PUNCT")) {
            return punctClass;
        }

        int following = match.getEndIndex();
        boolean letterFollows = following < str.length() && Character.isLetter(str.charAt(following));
        if (!letterFollows) {
            return PunctDescription.CLOSE_PUNCT;
        }

        int preceding = match.getStartIndex() - 1;
        boolean letterPrecedes = preceding >= 0 && Character.isLetter(str.charAt(preceding));
        if (letterPrecedes) {
            return punctClass;
        }
        return PunctDescription.OPEN_PUNCT;
    }

    /**
     * Splits clitics off tokens that still carry the root class. For each such
     * token, matches of the proclitics matcher are annotated from the front and
     * matches of the enclitics matcher from the back, each with its clitic
     * class; a match of the nbl/nbr matcher is annotated with its punctuation
     * class when clitics are found on that side. What remains in between keeps
     * the root class.
     *
     * @param annotatedString the text whose class annotations are refined in place
     * @param languageResource supplies the matchers and the clitic description
     */
    private void identifyClitics(AnnotatedString annotatedString, LanguageResource languageResource) {
        // NOTE(review): the result is unused; the call is kept in case the getter
        // has side effects - confirm and remove.
        languageResource.getCliticsMatcher();
        RegExp procliticsMatcher = languageResource.getProcliticsMatcher();
        RegExp encliticsMatcher = languageResource.getEncliticsMatcher();
        RegExp nbrMatcher = languageResource.getNbrMatcher();
        RegExp nblMatcher = languageResource.getNblMatcher();
        String rootClass = languageResource.getClassesRoot().getTagName();

        // Start at the first position that carries a class annotation.
        char current = annotatedString.setIndex(0);
        if (null == annotatedString.getAnnotation(CLASS_ANNO)) {
            current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
        }
        // Character.MAX_VALUE (65535) signals that the iteration is exhausted.
        while (current != Character.MAX_VALUE) {
            int tokenEnd = annotatedString.getRunLimit(CLASS_ANNO);
            // Compare by value: reference comparison only works while both strings
            // happen to be the same object.
            if (!rootClass.equals(annotatedString.getAnnotation(CLASS_ANNO))) {
                current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
                continue;
            }

            int tokenStart = annotatedString.getIndex();
            // Advance to the next run before this one is re-annotated.
            current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            String image = annotatedString.substring(tokenStart, tokenEnd);
            // [consumed, remainingEnd) is the part of image not yet split off.
            int consumed = 0;
            int remainingEnd = image.length();

            // Front of the token: optional nbl punctuation followed by proclitics.
            Match leftPunct = nblMatcher.contains(image);
            Match proclitic = null != leftPunct
                ? procliticsMatcher.contains(image.substring(leftPunct.getEndIndex(), remainingEnd))
                : procliticsMatcher.contains(image);
            if (null != leftPunct && null != proclitic) {
                annotatedString.annotate(CLASS_ANNO, identifyPunctClass(leftPunct, nblMatcher, image, languageResource), tokenStart + leftPunct.getStartIndex(), tokenStart + leftPunct.getEndIndex());
                consumed = leftPunct.getEndIndex();
            }
            while (null != proclitic) {
                // Match indices are relative to the substring starting at consumed.
                annotatedString.annotate(CLASS_ANNO, identifyClass(proclitic.toString(), procliticsMatcher, languageResource.getClitDescr()), tokenStart + consumed + proclitic.getStartIndex(), tokenStart + consumed + proclitic.getEndIndex());
                consumed += proclitic.getEndIndex();
                proclitic = procliticsMatcher.contains(image.substring(consumed, image.length()));
            }

            // Back of the token: enclitics, optionally followed by nbr punctuation.
            Match rightPunct = nbrMatcher.contains(image);
            Match enclitic = null != rightPunct
                ? encliticsMatcher.contains(image.substring(consumed, rightPunct.getStartIndex()))
                : encliticsMatcher.contains(image.substring(consumed, remainingEnd));
            if (null != rightPunct && null != enclitic) {
                annotatedString.annotate(CLASS_ANNO, identifyPunctClass(rightPunct, nbrMatcher, image, languageResource), tokenStart + rightPunct.getStartIndex(), tokenStart + rightPunct.getEndIndex());
            }
            while (null != enclitic) {
                annotatedString.annotate(CLASS_ANNO, identifyClass(enclitic.toString(), encliticsMatcher, languageResource.getClitDescr()), tokenStart + consumed + enclitic.getStartIndex(), tokenStart + consumed + enclitic.getEndIndex());
                // Shrink the remainder so the next search finds the enclitic before this one.
                remainingEnd = consumed + enclitic.getStartIndex();
                enclitic = encliticsMatcher.contains(image.substring(consumed, remainingEnd));
            }

            // The part between the clitics keeps the root class.
            if (consumed != remainingEnd) {
                annotatedString.annotate(CLASS_ANNO, rootClass, tokenStart + consumed, tokenStart + remainingEnd);
            }
        }
    }

    /**
     * Classifies numeric tokens. Only tokens that still carry the root class and
     * contain a match of the simple-digits matcher are considered. A token
     * ending in a period that matches the ordinal matcher is annotated as a
     * whole with its number class. Otherwise a trailing period is split off, the
     * match of the digits matcher is annotated with its number class and the
     * period with its punctuation class.
     *
     * @param annotatedString the text whose class annotations are refined in place
     * @param languageResource supplies the matchers and the number and punctuation descriptions
     */
    private void identifyNumbers(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp simpleDigitsMatcher = languageResource.getSimpleDigitsMatcher();
        RegExp ordinalMatcher = languageResource.getOrdinalMatcher();
        RegExp digitsMatcher = languageResource.getDigitsMatcher();
        String rootClass = languageResource.getClassesRoot().getTagName();

        // Start at the first position that carries a class annotation.
        char current = annotatedString.setIndex(0);
        if (null == annotatedString.getAnnotation(CLASS_ANNO)) {
            current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
        }
        // Character.MAX_VALUE (65535) signals that the iteration is exhausted.
        while (current != Character.MAX_VALUE) {
            int tokenEnd = annotatedString.getRunLimit(CLASS_ANNO);
            // Compare by value: reference comparison only works while both strings
            // happen to be the same object.
            if (!rootClass.equals(annotatedString.getAnnotation(CLASS_ANNO))) {
                current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
                continue;
            }

            int tokenStart = annotatedString.getIndex();
            // Advance to the next run before this one is re-annotated.
            current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            String image = annotatedString.substring(tokenStart, tokenEnd);
            if (null == simpleDigitsMatcher.contains(image)) {
                continue;
            }

            boolean periodSplitOff = false;
            if ('.' == image.charAt(image.length() - 1)) {
                if (ordinalMatcher.matches(image)) {
                    annotatedString.annotate(CLASS_ANNO, identifyClass(image, ordinalMatcher, languageResource.getNumbDescr()), tokenStart, tokenEnd);
                    // The whole token, period included, is the ordinal. Falling through
                    // would re-annotate its digits and annotate the position after the token.
                    continue;
                }
                // Not an ordinal: treat the trailing period separately.
                periodSplitOff = true;
                image = image.substring(0, image.length() - 1);
                tokenEnd--;
            }

            Match digits = digitsMatcher.contains(image);
            if (null != digits) {
                annotatedString.annotate(CLASS_ANNO, identifyClass(digits.toString(), digitsMatcher, languageResource.getNumbDescr()), tokenStart + digits.getStartIndex(), tokenStart + digits.getEndIndex());
                if (periodSplitOff) {
                    // tokenEnd now points at the period that was split off above.
                    annotatedString.annotate(CLASS_ANNO, identifyClass(".", null, languageResource.getPunctDescr()), tokenEnd, tokenEnd + 1);
                }
            }
        }
    }

    /**
     * Classifies tokens that still carry the root class and contain a
     * single-character nbr match that is a period. Such a token is looked up in
     * the abbreviation lists first; failing that it is tested against the
     * initial matcher and the abbreviation matcher. If nothing applies, only the
     * period is annotated with its punctuation class.
     *
     * @param annotatedString the text whose class annotations are refined in place
     * @param languageResource supplies the matchers, abbreviation lists and descriptions
     */
    private void identifyAbbrev(AnnotatedString annotatedString, LanguageResource languageResource) {
        RegExp nbrMatcher = languageResource.getNbrMatcher();
        RegExp abbrevMatcher = languageResource.getAbbrevMatcher();
        RegExp initialMatcher = languageResource.getInitialMatcher();
        HashMap abbrevLists = languageResource.getAbbrevLists();
        String rootClass = languageResource.getClassesRoot().getTagName();

        // Start at the first position that carries a class annotation.
        char current = annotatedString.setIndex(0);
        if (null == annotatedString.getAnnotation(CLASS_ANNO)) {
            current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
        }
        // Character.MAX_VALUE (65535) signals that the iteration is exhausted.
        while (current != Character.MAX_VALUE) {
            int tokenEnd = annotatedString.getRunLimit(CLASS_ANNO);
            // Compare by value: reference comparison only works while both strings
            // happen to be the same object.
            if (!rootClass.equals(annotatedString.getAnnotation(CLASS_ANNO))) {
                current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
                continue;
            }

            int tokenStart = annotatedString.getIndex();
            // Advance to the next run before this one is re-annotated.
            current = annotatedString.setIndex(annotatedString.findNextAnnotation(CLASS_ANNO));
            String image = annotatedString.substring(tokenStart, tokenEnd);
            Match period = nbrMatcher.contains(image);
            boolean hasSinglePeriod = null != period
                && period.getEndIndex() - period.getStartIndex() == 1
                && annotatedString.charAt(tokenStart + period.getStartIndex()) == '.';
            if (!hasSinglePeriod) {
                continue;
            }

            // For hyphenated tokens, classify by the part after the last hyphen if
            // that part is at least two characters long and contains no digit.
            int lastHyphen = image.lastIndexOf("-");
            if (lastHyphen != -1) {
                String afterHyphen = image.substring(lastHyphen + 1);
                if (afterHyphen.matches("[^0-9]{2,}")) {
                    image = afterHyphen;
                }
            }

            // An entry in one of the abbreviation lists takes precedence; the key of
            // the list becomes the class.
            boolean listed = false;
            for (Iterator it = abbrevLists.keySet().iterator(); it.hasNext(); ) {
                String listClass = (String) it.next();
                if (((Set) abbrevLists.get(listClass)).contains(image)) {
                    annotatedString.annotate(CLASS_ANNO, listClass, tokenStart, tokenStart + period.getEndIndex());
                    listed = true;
                    break;
                }
            }
            if (listed) {
                continue;
            }

            if (initialMatcher != null && initialMatcher.matches(image)) {
                annotatedString.annotate(CLASS_ANNO, identifyClass(image, initialMatcher, languageResource.getAbbrevDescr()), tokenStart, tokenStart + period.getEndIndex());
            } else if (abbrevMatcher.matches(image)) {
                annotatedString.annotate(CLASS_ANNO, identifyClass(image, abbrevMatcher, languageResource.getAbbrevDescr()), tokenStart, tokenStart + period.getEndIndex());
            } else {
                // Not an abbreviation: only the period itself gets a punctuation class.
                annotatedString.annotate(CLASS_ANNO, identifyPunctClass(period, nbrMatcher, image, languageResource), tokenStart + period.getStartIndex(), tokenStart + period.getEndIndex());
            }
        }
    }

    /**
     * Writes border annotations. Walks the text run by run and marks the first
     * character of a run with {@code "tu"} when a new text unit starts there,
     * and the first character after a paragraph change with {@code "p"}.
     *
     * <p>Two flags carry state between runs: one is set after a token whose
     * class descends from {@code TERM_PUNCT} or {@code TERM_PUNCT_P}, the other
     * after a token whose class descends from {@code B_ABBREVIATION}. A
     * paragraph change resets both.
     *
     * @param annotatedString the text to annotate; its class annotations are read
     * @param languageResource supplies the class hierarchy, the internal text-unit
     *     matcher and the non-capitalised terms
     */
    private void identifyTus(AnnotatedString annotatedString, LanguageResource languageResource) {
        final char done = Character.MAX_VALUE;
        final String termPunct = "TERM_PUNCT".intern();
        final String termPunctP = "TERM_PUNCT_P".intern();
        final String closePunct = PunctDescription.CLOSE_PUNCT.intern();
        final String closeBracket = PunctDescription.CLOSE_BRACKET.intern();
        final String openPunct = PunctDescription.OPEN_PUNCT.intern();
        final String breakingAbbrev = "B_ABBREVIATION".intern();

        RegExp internalTuMatcher = languageResource.getInternalTuMatcher();
        boolean afterTerminator = false;
        boolean afterBreakingAbbrev = false;

        char current = annotatedString.setIndex(0);
        while (current != done) {
            int runStart = annotatedString.getRunStart(CLASS_ANNO);
            int runLimit = annotatedString.getRunLimit(CLASS_ANNO);
            String tokenClass = (String) annotatedString.getAnnotation(CLASS_ANNO);

            if (tokenClass == null) {
                // Run without a class annotation: only paragraph changes are of interest.
                boolean paragraphChange = isParagraphChange(annotatedString.substring(runStart, runLimit));
                current = annotatedString.setIndex(runLimit);
                if (paragraphChange) {
                    afterTerminator = false;
                    afterBreakingAbbrev = false;
                    if (current != done) {
                        annotatedString.annotate(BORDER_ANNO, P_BORDER, annotatedString.getIndex(), annotatedString.getIndex() + 1);
                    }
                }
                continue;
            }

            if (afterTerminator) {
                // Further terminating or closing punctuation still belongs to the current unit.
                boolean stillClosing = languageResource.isAncestor(termPunct, tokenClass)
                    || languageResource.isAncestor(termPunctP, tokenClass)
                    || languageResource.isAncestor(closePunct, tokenClass)
                    || languageResource.isAncestor(closeBracket, tokenClass);
                if (!stillClosing) {
                    boolean continuesUnit = Character.isLowerCase(current)
                        || internalTuMatcher.matches(annotatedString.substring(annotatedString.getIndex(), annotatedString.getIndex() + 1));
                    if (!continuesUnit) {
                        annotatedString.annotate(BORDER_ANNO, TU_BORDER, runStart, runStart + 1);
                    }
                    afterTerminator = false;
                }
            } else if (afterBreakingAbbrev) {
                boolean startsUnit = languageResource.getNonCapTerms().contains(annotatedString.substring(runStart, runLimit))
                    || languageResource.isAncestor(openPunct, tokenClass);
                if (startsUnit) {
                    annotatedString.annotate(BORDER_ANNO, TU_BORDER, runStart, runStart + 1);
                }
                afterBreakingAbbrev = false;
            } else if (languageResource.isAncestor(termPunct, tokenClass) || languageResource.isAncestor(termPunctP, tokenClass)) {
                afterTerminator = true;
            } else if (languageResource.isAncestor(breakingAbbrev, tokenClass)) {
                afterBreakingAbbrev = true;
            }

            current = annotatedString.setIndex(runLimit);
        }
    }

    /**
     * Tells whether a string marks a paragraph change, i.e. whether it contains
     * the same line-break character (line feed or carriage return) at least twice.
     *
     * @param str the string to inspect
     * @return {@code true} if a line feed or a carriage return occurs more than once
     */
    private boolean isParagraphChange(String str) {
        char[] lineBreaks = {'\n', '\r'};
        for (int k = 0; k < lineBreaks.length; k++) {
            int firstOccurrence = str.indexOf(lineBreaks[k]);
            if (firstOccurrence >= 0 && str.indexOf(lineBreaks[k], firstOccurrence + 1) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Finds the class name for a string. If a matcher is given and the
     * description maps that matcher to a class, that class is returned directly.
     * Otherwise the definitions of the description are tried one by one and the
     * name of the first definition matching the string is returned.
     *
     * @param str the string to classify
     * @param regExp the matcher that found the string, or {@code null}
     * @param description supplies the matcher-to-class map and the class definitions
     * @return the class name found for the string
     * @throws ProcessingException if no definition matches the string
     */
    private String identifyClass(String str, RegExp regExp, Description description) {
        if (regExp != null) {
            String mappedClass = (String) description.getRegExpMap().get(regExp);
            if (mappedClass != null) {
                return mappedClass;
            }
        }

        HashMap definitions = description.getDefinitionsMap();
        Iterator classNames = definitions.keySet().iterator();
        while (classNames.hasNext()) {
            String className = (String) classNames.next();
            RegExp definition = (RegExp) definitions.get(className);
            if (definition.matches(str)) {
                return className;
            }
        }
        throw new ProcessingException("could not find class for " + str);
    }

    /**
     * Tells whether one token class is an ancestor of another in the class
     * hierarchy of a language. Both tags are first translated through the
     * classes map of the language resource.
     *
     * @param str tag of the potential ancestor class
     * @param str2 tag of the potential descendant class
     * @param str3 the language key
     * @return the result of the language resource's ancestor check on the
     *     translated classes
     * @throws ProcessingException if either tag is not present in the classes map
     */
    public boolean isAncestor(String str, String str2, String str3) throws ProcessingException {
        LanguageResource resource = getLanguageResource(str3);

        String ancestorClass = (String) resource.getClassesMap().get(str);
        if (ancestorClass == null) {
            throw new ProcessingException("undefined token class tag " + str);
        }

        String descendantClass = (String) resource.getClassesMap().get(str2);
        if (descendantClass == null) {
            throw new ProcessingException("undefined token class tag " + str2);
        }

        return resource.isAncestor(ancestorClass, descendantClass);
    }

    /**
     * Command-line entry point: reads a file, tokenizes it with the default
     * configuration and prints the resulting paragraphs to standard output.
     *
     * @param strArr the file name, the language key and, optionally, the
     *     encoding of the file (ISO-8859-1 if omitted)
     */
    public static void main(String[] strArr) {
        boolean validArgCount = strArr.length == 2 || strArr.length == 3;
        if (!validArgCount) {
            System.out.println("This method needs two arguments:\n- a file name for the document to tokenize\n- the language of the document\n- an optional encoding to use (default is ISO-8859-1)\nSupported languages are: de, en, it");
            System.exit(1);
        }

        String text = null;
        try {
            File inputFile = new File(strArr[0]);
            String encoding = strArr.length == 3 ? strArr[2] : "ISO-8859-1";
            text = FileTools.readFileAsString(inputFile, encoding);
        } catch (IOException e) {
            System.err.println(e.toString());
            System.exit(1);
        }

        try {
            JTok tokenizer = new JTok();
            AnnotatedString tokenized = tokenizer.tokenize(text, strArr[1]);
            for (Paragraph paragraph : ParagraphOutputter.createParagraphs(tokenized)) {
                System.out.println(paragraph);
            }
        } catch (IOException e2) {
            LOG.error(e2.getLocalizedMessage(), e2);
        }
    }
}
