package com.rapidminer.operator;

import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.ListDataRowReader;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.gui.properties.PropertyTable;
import com.rapidminer.operator.crawler.CrawlerPolicyProperties;
import com.rapidminer.operator.crawler.ParameterTypeCrawlerPolicy;
import com.rapidminer.operator.crawler.RapidMinerBasedCrawler;
import com.rapidminer.operator.crawler.StringMatchingLiteral;
import com.rapidminer.operator.crawler.StringMatchingRuleSet;
import com.rapidminer.operator.visualization.dependencies.NumericalMatrix;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDirectory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.ParameterTypeString;
import java.awt.Component;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.StringTokenizer;
import javax.swing.JOptionPane;
import org.hibernate.annotations.common.reflection.XClass;
import org.hsqldb.Tokens;
import websphinx.Link;

/* loaded from: input_file:WEB-INF/lib/rapidminer-plugintext-1.0.0.jar:com/rapidminer/operator/CrawlerOperator.class */
/**
 * Operator that runs a {@link RapidMinerBasedCrawler} starting at a configurable root URL.
 *
 * <p>The operator consumes no input. On success it delivers two objects: an example set built
 * from the data rows collected by the crawler, and the crawler's link matrix. If the user
 * declines the robots.txt warning, an empty result array is returned instead.
 */
public class CrawlerOperator extends Operator {
    /** Key of the parameter holding the URL at which the crawler starts. */
    public static final String PARAMETER_URL = "url";
    /** Key of the list parameter holding the crawling rules. */
    public static final String PARAMETER_CRAWLING_RULES = "crawling_rules";
    /** Key of the parameter holding the maximal crawling depth. */
    public static final String PARAMETER_MAX_DEPTH = "max_depth";
    /** Key of the parameter holding the delay between page visits. */
    public static final String PARAMETER_DELAY = "delay";
    /** Key of the parameter holding the number of parallel crawling threads. */
    public static final String PARAMETER_MAX_THREADS = "max_threads";
    /** Key of the parameter holding the directory the files are written to. */
    public static final String PARAMETER_OUTPUT_DIR = "output_dir";
    /** Key of the parameter holding the extension of the stored files. */
    public static final String PARAMETER_EXTENSION = "extension";
    /** Key of the parameter holding the maximum page size. */
    public static final String PARAMETER_MAX_PAGE_SIZE = "max_page_size";
    /** Key of the parameter holding the user agent string sent by the crawler. */
    public static final String PARAMETER_USER_AGENT = "user_agent";
    /** Key of the boolean parameter that switches robots.txt support on or off. */
    public static final String PARAMETER_OBEY_ROBOT_EXCLUSION = "obey_robot_exclusion";

    static {
        // Registers the editor used by the property table for crawler policy parameters.
        PropertyTable.registerPropertyKeyCellEditor(ParameterTypeCrawlerPolicy.class, CrawlerPolicyProperties.class);
    }

    /**
     * Creates the operator.
     *
     * @param operatorDescription the description handed on to the {@link Operator} constructor
     */
    public CrawlerOperator(OperatorDescription operatorDescription) {
        super(operatorDescription);
    }

    /**
     * Configures the crawler from the operator parameters, runs it and collects its results.
     *
     * @return an array holding the example set created from the crawler's data rows followed by
     *         the crawler's link matrix, or an empty array if the user did not confirm crawling
     *         with robot exclusion disabled
     * @throws OperatorException if a parameter cannot be read; a {@link UserError} is thrown if
     *         the configured URL is malformed
     */
    @Override // com.rapidminer.operator.Operator
    public IOObject[] apply() throws OperatorException {
        HashMap crawlingRules = buildCrawlingRules();
        RapidMinerBasedCrawler crawler = new RapidMinerBasedCrawler(
                crawlingRules,
                getParameterAsFile(PARAMETER_OUTPUT_DIR, true),
                getParameterAsString(PARAMETER_EXTENSION),
                getParameterAsInt(PARAMETER_DELAY),
                getParameterAsString(PARAMETER_USER_AGENT),
                this);

        // Read once: the value configures the download parameters and decides about the warning.
        boolean obeyRobotExclusion = getParameterAsBoolean(PARAMETER_OBEY_ROBOT_EXCLUSION);
        crawler.setDownloadParameters(crawler.getDownloadParameters()
                .changeMaxThreads(getParameterAsInt(PARAMETER_MAX_THREADS))
                .changeObeyRobotExclusion(obeyRobotExclusion)
                .changeUserAgent(getParameterAsString(PARAMETER_USER_AGENT))
                .changeMaxPageSize(getParameterAsInt(PARAMETER_MAX_PAGE_SIZE)));

        // Ignoring robots.txt requires an explicit "Yes"; anything else aborts the crawl.
        if (!obeyRobotExclusion && !confirmDisabledRobotExclusion()) {
            logNote("Crawling operation aborded by the user");
            return new IOObject[0];
        }

        crawler.setMaxDepth(getParameterAsInt(PARAMETER_MAX_DEPTH));
        MemoryExampleTable table = new MemoryExampleTable(crawler.getCrawlerExtractedAttributes());
        String rootUrl = getParameterAsString(PARAMETER_URL);
        try {
            crawler.setRoot(new Link(rootUrl));
            crawler.run();
            table.readExamples(new ListDataRowReader(crawler.getDataRows().iterator()));
            return new IOObject[]{table.createExampleSet(new HashMap()), crawler.getLinkMatrix()};
        } catch (MalformedURLException e) {
            // NOTE(review): Tokens.PREPARE (an hsqldb constant) is presumably a decompiler
            // substitution for a numeric UserError code -- confirm and replace with the real code.
            throw new UserError(this, Tokens.PREPARE, rootUrl, e);
        }
    }

    /**
     * Translates the {@link #PARAMETER_CRAWLING_RULES} list into rule sets.
     *
     * <p>Each list entry is a pair: element 0 is the key under which the rule set is stored,
     * element 1 is a sequence of tokens separated by {@code Example.SEPARATOR}. Every entry
     * becomes one conjunction of literals; entries sharing a key are added to the same
     * {@link StringMatchingRuleSet}. A token starting with {@code '-'} has that character removed
     * and is created with the flag {@code true} (presumably "negated" -- TODO confirm against
     * {@link StringMatchingLiteral}).
     *
     * <p>Raw collection types are kept on purpose: the generic signatures of
     * {@code RapidMinerBasedCrawler} and {@code StringMatchingRuleSet.addConjunction} are not
     * visible from this class.
     *
     * @return a map from rule key to its {@link StringMatchingRuleSet}
     * @throws OperatorException if the rule list parameter cannot be read
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private HashMap buildCrawlingRules() throws OperatorException {
        HashMap ruleSetsByKey = new HashMap();
        for (String[] rule : getParameterList(PARAMETER_CRAWLING_RULES)) {
            String ruleKey = rule[0];
            String ruleValue = rule[1];

            LinkedList literals = new LinkedList();
            StringTokenizer tokenizer = new StringTokenizer(ruleValue, Example.SEPARATOR);
            while (tokenizer.hasMoreTokens()) {
                // StringTokenizer never yields empty tokens, so charAt(0) is safe here.
                String token = tokenizer.nextToken();
                boolean dashPrefixed = token.charAt(0) == '-';
                if (dashPrefixed) {
                    token = token.substring(1);
                }
                literals.add(new StringMatchingLiteral(token, dashPrefixed));
            }

            StringMatchingRuleSet ruleSet = (StringMatchingRuleSet) ruleSetsByKey.get(ruleKey);
            if (ruleSet == null) {
                ruleSet = new StringMatchingRuleSet();
                ruleSetsByKey.put(ruleKey, ruleSet);
            }
            ruleSet.addConjunction(literals);
        }
        return ruleSetsByKey;
    }

    /**
     * Asks the user to confirm crawling although robots.txt support is switched off.
     *
     * <p>Only an explicit "Yes" counts as confirmation. Choosing "No" or closing the dialog
     * (which {@link JOptionPane} reports as {@code CLOSED_OPTION}) both decline.
     *
     * <p>NOTE(review): {@code showConfirmDialog} throws {@code HeadlessException} in a headless
     * JVM -- confirm how this operator should behave when it is executed without a display.
     *
     * @return {@code true} if the user chose to proceed
     */
    private static boolean confirmDisabledRobotExclusion() {
        int answer = JOptionPane.showConfirmDialog(
                (Component) null,
                "You disabled the support for the robots.txt. Do this only if you know what you are doing and if you are sure not to violate any laws or terms of use. Do you wish to proceed?",
                "Warning: Disabled robot exclusion",
                JOptionPane.YES_NO_OPTION,
                JOptionPane.WARNING_MESSAGE);
        return answer == JOptionPane.YES_OPTION;
    }

    /**
     * @return an empty array; this operator does not consume any input
     */
    @Override // com.rapidminer.operator.Operator
    public Class<?>[] getInputClasses() {
        return new Class[0];
    }

    /**
     * @return the delivered classes: {@link ExampleSet} and {@link NumericalMatrix}
     */
    @Override // com.rapidminer.operator.Operator
    public Class<?>[] getOutputClasses() {
        return new Class[]{ExampleSet.class, NumericalMatrix.class};
    }

    /**
     * Appends the crawler parameters to the parameter types of the superclass.
     *
     * @return the parameter types of the superclass followed by the crawler parameters
     */
    @Override // com.rapidminer.operator.Operator, com.rapidminer.parameter.ParameterHandler
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterTypeString url = new ParameterTypeString(PARAMETER_URL, "Specifies the url at which the crawler should start", false);
        url.setExpert(false);
        types.add(url);

        // NOTE(review): XClass.ACCESS_PROPERTY (a Hibernate constant) is presumably a decompiler
        // substitution for a plain string key -- confirm and inline the literal.
        ParameterTypeList crawlingRules = new ParameterTypeList(PARAMETER_CRAWLING_RULES, "Specifies a set of rules that determine, which links to follow and which pages to process (see tutorial for details)", new ParameterTypeCrawlerPolicy(XClass.ACCESS_PROPERTY, "the value of the property"));
        crawlingRules.setExpert(false);
        types.add(crawlingRules);

        ParameterTypeInt maxDepth = new ParameterTypeInt(PARAMETER_MAX_DEPTH, "Specifies the maximal depth of the crawling process", 0, Integer.MAX_VALUE, 2);
        maxDepth.setExpert(false);
        types.add(maxDepth);

        ParameterTypeInt delay = new ParameterTypeInt(PARAMETER_DELAY, "Specifies the delay when vistiting a page in milleseconds", 0, Integer.MAX_VALUE, 1000);
        delay.setExpert(false);
        types.add(delay);

        ParameterTypeInt maxThreads = new ParameterTypeInt(PARAMETER_MAX_THREADS, "Specifies the number of crawling threads working in parallel", 1, Integer.MAX_VALUE, 1);
        maxThreads.setExpert(true);
        types.add(maxThreads);

        ParameterTypeDirectory outputDir = new ParameterTypeDirectory(PARAMETER_OUTPUT_DIR, "Specifies the directory to which to write the files", false);
        outputDir.setExpert(false);
        types.add(outputDir);

        ParameterTypeString extension = new ParameterTypeString(PARAMETER_EXTENSION, "Specifies the extension of the stored files", "txt");
        extension.setExpert(false);
        types.add(extension);

        ParameterTypeInt maxPageSize = new ParameterTypeInt(PARAMETER_MAX_PAGE_SIZE, "Specifies the maximum page size (in KB): pages larger than this limit are not downloaded", 1, Integer.MAX_VALUE, 100);
        maxPageSize.setExpert(false);
        types.add(maxPageSize);

        ParameterTypeString userAgent = new ParameterTypeString(PARAMETER_USER_AGENT, "The identity the crawler uses while accessing a server", "rapid-miner-crawler");
        userAgent.setExpert(true);
        types.add(userAgent);

        ParameterTypeBoolean obeyRobotExclusion = new ParameterTypeBoolean(PARAMETER_OBEY_ROBOT_EXCLUSION, "Specifies whether the crawler obeys the rules, which pages on site might be visited by a robot. Disable only if you know what you are doing and if you a sure not to violate any existing laws by doing so", true);
        obeyRobotExclusion.setExpert(true);
        types.add(obeyRobotExclusion);

        return types;
    }
}
