package com.rapidminer.operator;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeWeights;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.condition.InnerOperatorCondition;
import com.rapidminer.operator.condition.LastInnerOperatorCondition;
import com.rapidminer.operator.extraction.ExtractingInputFilter;
import com.rapidminer.operator.extraction.ExtractionException;
import com.rapidminer.operator.extraction.util.FeatureExtractionUtil;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.ParameterTypeStringCategory;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.ObjectVisualizerService;
import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.config.WVTConfigurationFact;
import edu.udo.cs.wvtool.generic.vectorcreation.BinaryOccurrences;
import edu.udo.cs.wvtool.generic.vectorcreation.TFIDF;
import edu.udo.cs.wvtool.generic.vectorcreation.TermFrequency;
import edu.udo.cs.wvtool.generic.vectorcreation.TermOccurrences;
import edu.udo.cs.wvtool.generic.vectorcreation.WVTVectorCreator;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTInputList;
import edu.udo.cs.wvtool.main.WVTTokenSequence;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.util.WVToolException;
import edu.udo.cs.wvtool.util.WVToolLogger;
import edu.udo.cs.wvtool.wordlist.WVTWordList;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/* loaded from: input_file:com/rapidminer/operator/TextInput.class */
public abstract class TextInput extends OperatorChain {
    // ---- Parameter keys registered in getParameterTypes() ----

    /** Key of the parameter holding the default content type of a document. */
    public static final String PARAMETER_DEFAULT_CONTENT_TYPE = "default_content_type";
    /** Key of the parameter holding the default content encoding of a document. */
    public static final String PARAMETER_DEFAULT_CONTENT_ENCODING = "default_content_encoding";
    /** Key of the parameter holding the default content language of a document. */
    public static final String PARAMETER_DEFAULT_CONTENT_LANGUAGE = "default_content_language";
    /** Key of the lower pruning bound; read by pruneWordList() as an absolute number or a value ending in '%'. */
    public static final String PARAMETER_PRUNE_BELOW = "prune_below";
    /** Key of the upper pruning bound; read by pruneWordList() as an absolute number or a value ending in '%'. */
    public static final String PARAMETER_PRUNE_ABOVE = "prune_above";
    /** Key of the parameter selecting the vector creation method (one of {@link #vectorCreation}). */
    public static final String PARAMETER_VECTOR_CREATION = "vector_creation";
    /** Key of the boolean parameter controlling content type/encoding/language attributes in the result. */
    public static final String PARAMETER_USE_CONTENT_ATTRIBUTES = "use_content_attributes";
    /** Key of the boolean parameter that makes apply() take the word list from the operator input. */
    public static final String PARAMETER_USE_GIVEN_WORD_LIST = "use_given_word_list";
    /** Key of the file parameter from which apply() loads a word list. */
    public static final String PARAMETER_INPUT_WORD_LIST = "input_word_list";
    /** Key of the boolean parameter that makes apply() return the word list together with the example set. */
    public static final String PARAMETER_RETURN_WORD_LIST = "return_word_list";
    /** Key of the file parameter to which apply() stores the used word list. */
    public static final String PARAMETER_OUTPUT_WORD_LIST = "output_word_list";
    /** Key of the category parameter selecting the id type; its index is passed to the text visualizer. */
    public static final String PARAMETER_ID_ATTRIBUTE_TYPE = "id_attribute_type";
    /** Key of the query (XPath or regular expression) used to extract the relevant parts of a document. */
    public static final String PARAMETER_TEXT_QUERY = "text_query";
    /** Key of the boolean parameter that makes apply() register a text object visualizer. */
    public static final String PARAMETER_CREATE_TEXT_VISUALIZER = "create_text_visualizer";
    /** Key of the int parameter giving the document interval for on-the-fly pruning in createWordList(). */
    public static final String PARAMETER_ON_THE_FLY_PRUNING = "on_the_fly_pruning";
    /** Vector creator classes whose canonical names are offered for the vector_creation parameter. */
    public static Class<?>[] vectorCreation = {TermFrequency.class, TermOccurrences.class, TFIDF.class, BinaryOccurrences.class};
    /** Maps the vector_creation parameter value to a creator instance; assigned in getParameterTypes(), used in apply(). */
    private ClassNameMapper vectorCreationMapper;
    /** Example set taken from the operator input by apply(), if one is present; null otherwise. */
    protected ExampleSet inputExampleSet;
    /** WVTool instance used to read documents, fill the word list and create vectors. */
    private WVTool wvt;
    /** Configuration returned by createConfiguration(); assigned at the start of every apply() call. */
    private WVTConfiguration config;

    /**
     * Creates the operator with a fresh {@link WVTool} and no input example set.
     *
     * @param operatorDescription the description handed on to the super class
     */
    public TextInput(OperatorDescription operatorDescription) {
        super(operatorDescription);
        this.wvt = new WVTool(false);
        this.inputExampleSet = null;
    }

    /**
     * Supplies the list of documents to process. apply() iterates over its entries once to
     * build the word list (unless an external word list is used) and once to create the vectors.
     *
     * @return the input list describing the documents
     * @throws OperatorException if the list cannot be created
     */
    protected abstract WVTInputList createInputList() throws OperatorException;

    /**
     * Supplies the label attribute that apply() hands to
     * {@link #getOutputFilter(WVTWordList, Attribute)}.
     *
     * @return the label attribute
     * @throws OperatorException if the label cannot be determined
     */
    protected abstract Attribute getLabel() throws OperatorException;

    /**
     * Supplies the WVTool configuration. It is called at the very beginning of apply() and the
     * result is used for reading documents and for the optional text visualizer.
     *
     * @return the configuration for this run
     * @throws OperatorException if the configuration cannot be created
     */
    protected abstract WVTConfiguration createConfiguration() throws OperatorException;

    /**
     * Supplies the output filter that collects the created vectors. apply() calls write() on it
     * for every document and afterwards createExampleSet() and cleanUp().
     *
     * @param wVTWordList the word list used for vector creation
     * @param attribute the label attribute obtained from {@link #getLabel()}
     * @return the output filter for this run
     * @throws OperatorException if the filter cannot be created
     */
    protected abstract RapidMinerOutputFilter getOutputFilter(WVTWordList wVTWordList, Attribute attribute) throws OperatorException;

    /**
     * Builds a word list from all documents of the given input list.
     *
     * @param wVTInputList the documents to scan
     * @param list initial words for the word list, or null to start with an empty list
     * @param z flag passed through unchanged to {@code WVTool.addToWordList}
     * @param i pruning interval: after every i-th document the list is pruned with
     *          {@code pruneByFrequency(2, Integer.MAX_VALUE)}; values &lt;= 0 disable this
     * @return the filled word list
     * @throws WVToolException if WVTool fails while adding a document
     * @throws OperatorException if a document cannot be tokenized, or (user error 932) if the
     *         input list contains no documents
     */
    private WVTWordList createWordList(WVTInputList wVTInputList, List<?> list, boolean z, int i) throws WVToolException, OperatorException {
        final int numClasses = wVTInputList.getNumClasses();
        WVTWordList words;
        if (list == null) {
            words = new WVTWordList(numClasses);
        } else {
            words = new WVTWordList(list, numClasses);
        }

        int processed = 0;
        for (Iterator documents = wVTInputList.getEntries(); documents.hasNext();) {
            WVTDocumentInfo document = (WVTDocumentInfo) documents.next();
            this.wvt.addToWordList(words, getTokenSequence(document), z);
            processed++;
            boolean pruneNow = i > 0 && processed % i == 0;
            if (pruneNow) {
                words.pruneByFrequency(2, Integer.MAX_VALUE);
            }
        }

        // Nothing was read at all: report this to the user instead of returning an empty list.
        if (processed == 0) {
            throw new UserError(this, 932);
        }
        return words;
    }

    /**
     * Reads the complete text of a document and sends it through all inner operators.
     *
     * <p>The document is read line by line; every line is followed by a single blank in the
     * resulting text. The text is wrapped into a {@link TokenSequence}, which is then passed
     * from one inner operator to the next in their defined order.
     *
     * @param wVTDocumentInfo the document to read
     * @return the token sequence delivered by the last inner operator
     * @throws OperatorException as user error 127 if no token sequence is delivered, 302 if the
     *         document cannot be read, or 905 if WVTool fails; also whatever the inner
     *         operators throw
     */
    private WVTTokenSequence getTokenSequence(WVTDocumentInfo wVTDocumentInfo) throws OperatorException {
        try {
            StringBuilder text = new StringBuilder();
            BufferedReader reader = new BufferedReader(this.wvt.getReader(wVTDocumentInfo, this.config));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    text.append(line).append(' ');
                }
            } finally {
                // Always release the underlying stream, even if reading fails half way.
                reader.close();
            }

            IOContainer container = new IOContainer(new IOObject[]{new TokenSequence(text.toString(), wVTDocumentInfo)});
            for (int i = 0; i < getNumberOfOperators(); i++) {
                container = getOperator(i).apply(container);
            }
            return container.get(TokenSequence.class);
        } catch (MissingIOObjectException e) {
            throw new UserError(this, e, 127, new Object[]{e});
        } catch (IOException e2) {
            throw new UserError(this, e2, 302, new Object[]{wVTDocumentInfo.getSourceName(), e2});
        } catch (WVToolException e3) {
            throw new UserError(this, e3, 905, new Object[]{"wvtool", e3});
        }
    }

    /**
     * Prunes the word list according to the parameters {@code prune_below} and
     * {@code prune_above}.
     *
     * <p>Each parameter is either an integer or a number followed by '%'. A percentage is
     * converted into a frequency by looking up the frequency at the corresponding rank of the
     * word list. A value that cannot be parsed is logged as an error and ignored; a missing or
     * blank value is ignored silently. Nothing happens if neither bound resolves to a
     * non-negative value.
     *
     * @param wVTWordList the word list to prune in place
     * @throws UndefinedParameterError if one of the two parameters cannot be retrieved
     */
    private void pruneWordList(WVTWordList wVTWordList) throws UndefinedParameterError {
        String belowSpec = getParameter(PARAMETER_PRUNE_BELOW);
        String aboveSpec = getParameter(PARAMETER_PRUNE_ABOVE);
        int below = -1;
        int above = -1;
        try {
            below = resolvePruneThreshold(wVTWordList, belowSpec, true);
        } catch (NumberFormatException e) {
            logError("Could not parse the parameter prune_below: " + e.getMessage());
        }
        try {
            above = resolvePruneThreshold(wVTWordList, aboveSpec, false);
        } catch (NumberFormatException e2) {
            logError("Could not parse the parameter prune_above: " + e2.getMessage());
        }
        if (below >= 0 || above >= 0) {
            log("Pruning word list.");
            wVTWordList.pruneByFrequency(below < 0 ? 0 : below, above < 0 ? Integer.MAX_VALUE : above);
        }
    }

    /**
     * Converts one pruning specification into a frequency threshold.
     *
     * @param wordList the word list used to resolve percentage values
     * @param spec the raw parameter value; may be null or blank
     * @param fromTop if true, a percentage p is resolved at rank {@code numWords * (1 - p/100)},
     *        otherwise at rank {@code numWords * p/100}; the rank is never smaller than 1
     * @return the threshold, or -1 if the specification is null or blank
     * @throws NumberFormatException if the specification is not a valid number
     */
    private static int resolvePruneThreshold(WVTWordList wordList, String spec, boolean fromTop) {
        if (spec == null) {
            return -1;
        }
        String value = spec.trim();
        if (value.length() == 0) {
            // Guard: charAt(length - 1) below would fail on an empty string.
            return -1;
        }
        if (value.charAt(value.length() - 1) != '%') {
            return Integer.parseInt(value);
        }
        double fraction = Double.parseDouble(value.substring(0, value.length() - 1)) / 100.0d;
        if (fromTop) {
            fraction = 1.0d - fraction;
        }
        int rank = (int) Math.max(wordList.getNumWords() * fraction, 1.0d);
        return wordList.getFrequencyByRank(rank);
    }

    /**
     * Transforms the documents of the input list into an example set of word vectors.
     *
     * <p>Steps, in this order:
     * <ol>
     * <li>Create the configuration and, if a text query is set, register an extracting input
     *     filter for it.</li>
     * <li>Obtain the word list: from the operator input ({@code use_given_word_list}), from a
     *     file ({@code input_word_list}), or by scanning all documents. When scanning, input
     *     attribute weights (if present) restrict the list to attributes with a positive
     *     weight.</li>
     * <li>Prune the word list, unless it came from the input or from a file.</li>
     * <li>Create one vector per document and write it to the output filter.</li>
     * <li>Optionally register a text visualizer and store the word list to a file.</li>
     * </ol>
     *
     * <p>NOTE(review): {@code vectorCreationMapper} is only assigned in getParameterTypes();
     * this method relies on that having been called before - confirm against the super class.
     *
     * @return the created example set, followed by the word list if {@code return_word_list}
     *         is set
     * @throws OperatorException on invalid parameters, unreadable or unwritable files (user
     *         errors 302/303) or WVTool failures (user error 905)
     */
    public IOObject[] apply() throws OperatorException {
        this.config = createConfiguration();
        WVToolLogger.setGlobalLogger(new WVToolRapidMinerLogger(this));
        if (getNumberOfOperators() == 0) {
            logWarning("There are no suboperators for this operator. This is usually not intended. You would probably like to at least add a tokenizer");
        }
        if (getInput().contains(ExampleSet.class)) {
            this.inputExampleSet = (ExampleSet) getInput(ExampleSet.class);
        }
        String textQuery = getParameterAsString(PARAMETER_TEXT_QUERY);
        if (textQuery != null) {
            try {
                this.config.setConfigurationRule("inputfilter", new WVTConfigurationFact(new ExtractingInputFilter(this.config, FeatureExtractionUtil.getExtractor(textQuery, FeatureExtractionUtil.getNamespaceMapping(getParameters())))));
            } catch (ExtractionException e) {
                UserError userError = e.getUserError();
                userError.setOperator(this);
                throw userError;
            }
        }
        WVTVectorCreator vectorCreator = (WVTVectorCreator) this.vectorCreationMapper.getInstantiation(getParameterAsString(PARAMETER_VECTOR_CREATION));
        WVTInputList inputList = createInputList();

        // Attribute weights are an optional input: test for them instead of catching an exception.
        AttributeWeights inputWeights = null;
        if (getInput().contains(AttributeWeights.class)) {
            inputWeights = (AttributeWeights) getInput(AttributeWeights.class);
        }

        boolean useGivenWordList = getParameterAsBoolean(PARAMETER_USE_GIVEN_WORD_LIST);
        boolean wordListFromFile = isParameterSet(PARAMETER_INPUT_WORD_LIST);

        try {
            WVTWordList wordList;
            if (useGivenWordList) {
                wordList = ((WordList) getInput(WordList.class)).getWordList();
            } else if (wordListFromFile) {
                if (inputWeights != null) {
                    logWarning("Input attribute weights are ignored for word list loaded from file.");
                }
                wordList = readWordListFile(getParameterAsFile(PARAMETER_INPUT_WORD_LIST));
            } else if (inputWeights != null) {
                // Only attributes with a positive weight become initial words.
                List<String> weightedWords = new LinkedList<String>();
                for (Object nameObject : inputWeights.getAttributeNames()) {
                    String name = (String) nameObject;
                    if (inputWeights.getWeight(name) > 0.0d) {
                        weightedWords.add(name);
                    }
                }
                wordList = createWordList(inputList, weightedWords, false, getParameterAsInt(PARAMETER_ON_THE_FLY_PRUNING));
            } else {
                wordList = createWordList(inputList, null, true, getParameterAsInt(PARAMETER_ON_THE_FLY_PRUNING));
            }

            if (wordListFromFile || useGivenWordList) {
                log("Using external wordlist, no pruning is performed");
            } else {
                pruneWordList(wordList);
            }

            RapidMinerOutputFilter outputFilter = getOutputFilter(wordList, getLabel());
            Iterator documents = inputList.getEntries();
            while (documents.hasNext()) {
                WVTDocumentInfo document = (WVTDocumentInfo) documents.next();
                outputFilter.write(this.wvt.createVector(getTokenSequence(document), vectorCreator, wordList));
            }

            if (getParameterAsBoolean(PARAMETER_CREATE_TEXT_VISUALIZER)) {
                ObjectVisualizerService.addObjectVisualizer(new TextVisualizer(inputList, this.config, getParameterAsInt(PARAMETER_ID_ATTRIBUTE_TYPE)));
            }
            if (isParameterSet(PARAMETER_OUTPUT_WORD_LIST)) {
                writeWordListFile(wordList, getParameterAsFile(PARAMETER_OUTPUT_WORD_LIST, true));
            }

            ExampleSet result = outputFilter.createExampleSet();
            outputFilter.cleanUp();
            if (getParameterAsBoolean(PARAMETER_RETURN_WORD_LIST)) {
                return new IOObject[]{result, new WordList(wordList)};
            }
            return new IOObject[]{result};
        } catch (WVToolException e) {
            // Word list creation and vector creation run inside WVTool; report its failures uniformly.
            throw new UserError(this, e, 905, new Object[]{"wvtool", e});
        }
    }

    /** Loads a word list from the given file and always closes the file afterwards. */
    private WVTWordList readWordListFile(File file) throws UserError {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            try {
                return new WVTWordList(reader);
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            throw new UserError(this, e, 302, new Object[]{file, e.getMessage()});
        }
    }

    /** Stores the word list in the given file; closing the writer flushes it, and close errors are reported. */
    private void writeWordListFile(WVTWordList wordList, File file) throws UserError {
        try {
            FileWriter writer = new FileWriter(file);
            try {
                wordList.store(writer);
            } finally {
                writer.close();
            }
        } catch (IOException e) {
            throw new UserError(this, e, 303, new Object[]{file, e.getMessage()});
        }
    }

    /**
     * Declares the required inputs of this operator: none. The example set, attribute weights
     * and word list that apply() may read are therefore not listed here.
     *
     * @return an empty array
     */
    public Class<?>[] getInputClasses() {
        Class<?>[] noRequiredInputs = new Class[0];
        return noRequiredInputs;
    }

    /**
     * Declares the output classes of this operator.
     *
     * @return an array holding {@code ExampleSet.class} and {@code WordList.class}, in this order
     */
    public Class<?>[] getOutputClasses() {
        Class<?>[] delivered = new Class[]{ExampleSet.class, WordList.class};
        return delivered;
    }

    /**
     * Describes the contract for the inner operators: a {@link TokenSequence} is declared both
     * as the input class and as the output class of the condition.
     *
     * @return a {@link LastInnerOperatorCondition} built from these classes and the flag {@code true}
     */
    public InnerOperatorCondition getInnerOperatorCondition() {
        Class[] givenToInner = new Class[]{TokenSequence.class};
        Class[] expectedFromInner = new Class[]{TokenSequence.class};
        return new LastInnerOperatorCondition(givenToInner, expectedFromInner, true);
    }

    /**
     * Reports the upper bound for the number of inner operators.
     *
     * @return {@code Integer.MAX_VALUE}, i.e. no practical limit
     */
    public int getMaxNumberOfInnerOperators() {
        final int unlimited = Integer.MAX_VALUE;
        return unlimited;
    }

    /**
     * Reports the lower bound for the number of inner operators. Running without any inner
     * operator is allowed; apply() only logs a warning in that case.
     *
     * @return 0
     */
    public int getMinNumberOfInnerOperators() {
        final int noneRequired = 0;
        return noneRequired;
    }

    /**
     * Extends the parameter types of the super class by the text input parameters.
     *
     * <p>Side effect: every call creates a new {@code vectorCreationMapper} from the canonical
     * names of the classes in {@link #vectorCreation}. apply() uses this mapper to instantiate
     * the selected vector creator.
     *
     * @return the list obtained from the super class with the additional parameter types appended
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        parameterTypes.add(new ParameterTypeString(PARAMETER_DEFAULT_CONTENT_TYPE, "The default content type if not specified by the example set  (possible values: pdf, html, htm, xml, text, txt).", ""));
        parameterTypes.add(new ParameterTypeString(PARAMETER_DEFAULT_CONTENT_ENCODING, "The default content encoding if not specified by the example set (only encodings supported by Java can be used).", ""));
        parameterTypes.add(new ParameterTypeString(PARAMETER_DEFAULT_CONTENT_LANGUAGE, "The default content language if not specified by the example set.", ""));
        parameterTypes.add(new ParameterTypeString(PARAMETER_PRUNE_BELOW, "Prune words that appear in at most that many documents. -1 for no pruning. Alternatively you can provide a percentage value, denoting the lowest document frequency in p words with the highest frequency.", "-1"));
        parameterTypes.add(new ParameterTypeString(PARAMETER_PRUNE_ABOVE, "Prune words that appear in at least that many documents. -1 for no pruning. Alternatively you can provide a percentage value, denoting the highest document frequency in p words with the lowest frequency.", "-1"));

        // The category values are the short names of the available vector creator classes.
        String[] creatorClassNames = new String[vectorCreation.length];
        for (int i = 0; i < vectorCreation.length; i++) {
            creatorClassNames[i] = vectorCreation[i].getCanonicalName();
        }
        this.vectorCreationMapper = new ClassNameMapper(creatorClassNames);
        ParameterTypeStringCategory vectorCreationType = new ParameterTypeStringCategory(PARAMETER_VECTOR_CREATION, "Method used to create word vectors", this.vectorCreationMapper.getShortClassNames(), "TFIDF");
        vectorCreationType.setExpert(false);
        parameterTypes.add(vectorCreationType);

        parameterTypes.add(new ParameterTypeBoolean(PARAMETER_USE_CONTENT_ATTRIBUTES, "If set to true, the returned example set will contain content type, encoding, and language attributes.", false));
        parameterTypes.add(new ParameterTypeBoolean(PARAMETER_USE_GIVEN_WORD_LIST, "If set, the word list given in the input will be used", false));
        ParameterTypeFile inputWordListType = new ParameterTypeFile(PARAMETER_INPUT_WORD_LIST, "Load a word list from this file instead of creating it from the input data.", (String) null, true);
        inputWordListType.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_GIVEN_WORD_LIST, false, false));
        parameterTypes.add(inputWordListType);
        parameterTypes.add(new ParameterTypeBoolean(PARAMETER_RETURN_WORD_LIST, "If checked the word list will be returned as part of the result.", false));
        parameterTypes.add(new ParameterTypeFile(PARAMETER_OUTPUT_WORD_LIST, "Save the used word list into this file.", (String) null, true));
        parameterTypes.add(new ParameterTypeCategory(PARAMETER_ID_ATTRIBUTE_TYPE, "Indicates if long ids (complete paths), short ids (last part of the source name), or numerical ids will be used.", ExampleTableOutputFilter.ID_TYPE_NAMES, 2));
        parameterTypes.add(FeatureExtractionUtil.createNamespaceParameter());
        ParameterTypeString textQueryType = new ParameterTypeString(PARAMETER_TEXT_QUERY, "Query that extracts the parts of a document, that should be used for vectorization. This query can be XPath or a regular expression. If a regular expression is used, the query must have the following form: '<regex-expression> <replacement-pattern>', where the <replacement_pattern> states how a match is replaced to generate the final information. '$1' would yield the first matching group as result. For both, XPath and regular expression, all matches are concatenated and then passed to the vectorization process.");
        textQueryType.setExpert(true);
        parameterTypes.add(textQueryType);
        parameterTypes.add(new ParameterTypeBoolean(PARAMETER_CREATE_TEXT_VISUALIZER, "Indicates if a text specific object visualizer should be created which can be used in plotters etc. Note: Text visualization does not work for id type number.", false));
        // Default 0 (= no pruning, as documented) lies inside the declared range [0, MAX_VALUE].
        parameterTypes.add(new ParameterTypeInt(PARAMETER_ON_THE_FLY_PRUNING, "Denotes after how many documents, singular terms should be removed from the word list. 0 indicates no pruning.", 0, Integer.MAX_VALUE, 0));
        return parameterTypes;
    }
}
