/*
 * Decompiled with CFR 0.152.
 */
package com.rapidminer.operator.crawler;

import Jama.Matrix;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.operator.crawler.StringMatchingRuleSet;
import com.rapidminer.operator.visualization.dependencies.NumericalMatrix;
import com.rapidminer.tools.LoggingHandler;
import com.rapidminer.tools.math.matrix.ExtendedMatrix;
import com.rapidminer.tools.math.matrix.ExtendedSparseMatrix;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class RapidMinerBasedCrawler
extends Crawler {
    private static final long serialVersionUID = -614344545462107732L;
    private final File outputDir;
    private final String extension;
    private String userAgent;
    private final int delay;
    private final Map<String, StringMatchingRuleSet> rules;
    private int currentId = 0;
    private final List<DataRow> dataRows = new LinkedList<DataRow>();
    private final Attribute pathAttribute = AttributeFactory.createAttribute("document_source", 1);
    private final Attribute urlAttribute = AttributeFactory.createAttribute("url", 1);
    public static final String[] knownProperties = new String[]{"visit_url", "visit_content", "follow_url", "link_text"};
    private final ExtendedMatrix<String, String> linkMatrix = new ExtendedSparseMatrix<String, String>();
    private final Map<String, String> idURLMap = new HashMap<String, String>();
    private LoggingHandler logger;

    public RapidMinerBasedCrawler(Map<String, StringMatchingRuleSet> rules, File outputDir, String extension, int delayMillis, String userAgent, LoggingHandler logger) {
        this.outputDir = outputDir;
        this.extension = extension;
        this.delay = delayMillis;
        this.rules = rules;
        this.logger = logger;
        this.userAgent = userAgent;
        int i = 0;
        while (i < knownProperties.length) {
            if (rules.get(knownProperties[i]) == null) {
                this.rules.put(knownProperties[i], new StringMatchingRuleSet());
            }
            ++i;
        }
    }

    public List<Attribute> getCrawlerExtractedAttributes() {
        ArrayList<Attribute> result = new ArrayList<Attribute>();
        result.add(this.urlAttribute);
        result.add(this.pathAttribute);
        return result;
    }

    public List<DataRow> getDataRows() {
        return this.dataRows;
    }

    private synchronized void writeLog(String s) {
        this.logger.log("Crawler : " + s);
    }

    private void addToExampleSet(String url, String fileName) {
        DoubleArrayDataRow row = new DoubleArrayDataRow(new double[2]);
        row.set(this.urlAttribute, this.urlAttribute.getMapping().mapString(url));
        row.set(this.pathAttribute, this.pathAttribute.getMapping().mapString(fileName));
        this.dataRows.add(row);
    }

    private synchronized void store(Page p) {
        BufferedWriter out = null;
        String fileName = String.valueOf(this.outputDir.getAbsolutePath()) + File.separator + this.currentId + "." + this.extension;
        try {
            this.addToExampleSet(p.getURL().toExternalForm(), fileName);
            out = new BufferedWriter(new FileWriter(fileName));
            out.write(p.getContent());
            out.close();
        }
        catch (IOException e) {
            this.logger.logError("Could not store file " + fileName);
        }
        this.addLinks(p, this.currentId);
        ++this.currentId;
    }

    private void addLinks(Page p, int id) {
        String source = p.getURL().toExternalForm();
        Link[] links = p.getLinks();
        int i = 0;
        while (i < links.length) {
            String target = links[i].getURL().toExternalForm();
            this.linkMatrix.setEntry(source, target, 1.0);
            ++i;
        }
        this.idURLMap.put(source, String.valueOf(id));
    }

    public NumericalMatrix getLinkMatrix() {
        int numberOfRows = this.linkMatrix.getNumXLabels();
        int numberOfColumns = this.linkMatrix.getNumYLabels();
        String[] rowNames = new String[numberOfRows];
        String[] columnNames = new String[numberOfColumns];
        int index = 0;
        Iterator<Object> it = this.linkMatrix.getXLabels();
        while (it.hasNext()) {
            String xLabel = (String)it.next();
            rowNames[index++] = xLabel;
        }
        index = 0;
        it = this.linkMatrix.getYLabels();
        while (it.hasNext()) {
            String yLabel = (String)it.next();
            columnNames[index++] = yLabel;
        }
        double[][] counts = new double[numberOfRows][numberOfColumns];
        int xIndex = 0;
        Iterator it2 = this.linkMatrix.getXLabels();
        while (it2.hasNext()) {
            String xLabel = (String)it2.next();
            int yIndex = 0;
            Iterator it22 = this.linkMatrix.getYLabels();
            while (it22.hasNext()) {
                String yLabel = (String)it22.next();
                String xId = xLabel;
                String yId = yLabel;
                if (xId != null && yId != null && this.idURLMap.get(xLabel) != null && this.idURLMap.get(yLabel) != null && this.linkMatrix.getEntry(xLabel, yLabel) > 0.0) {
                    counts[xIndex][yIndex] = 1.0;
                }
                ++yIndex;
            }
            ++xIndex;
        }
        NumericalMatrix result = new NumericalMatrix("Link", rowNames, columnNames, new Matrix(counts));
        result.setFirstAttributeName("From");
        result.setSecondAttributeName("To");
        return result;
    }

    public boolean shouldVisit(Link link) {
        String targetURL = link.getURL().toExternalForm();
        String linkText = link.toText();
        StringMatchingRuleSet urlRules = this.rules.get("follow_url");
        StringMatchingRuleSet textRules = this.rules.get("link_text");
        if (urlRules.check(targetURL) && textRules.check(linkText)) {
            this.writeLog("Following link to " + targetURL);
            return true;
        }
        this.writeLog("Not following link to " + targetURL);
        return false;
    }

    public void visit(Page p) {
        try {
            if (this.delay > 0) {
                Thread.sleep(this.delay);
            }
        }
        catch (InterruptedException interruptedException) {
            // empty catch block
        }
        if (this.rules.get("visit_url").check(p.getURL().toExternalForm()) && this.rules.get("visit_content").check(p.getContent())) {
            this.writeLog("Storing page " + p.getURL().toExternalForm());
            this.store(p);
        } else {
            this.writeLog("Did not store page " + p.getURL().toExternalForm());
        }
        p.discardContent();
    }

    public String toString() {
        return this.userAgent;
    }
}

