package net.matuschek.spider;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;
import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.commons.fileupload.FileUploadBase;
import org.apache.log4j.Category;
import org.apache.log4j.spi.LocationInfo;
import org.apache.xpath.XPath;
import org.archive.io.warc.WARCConstants;
import org.archive.net.UURIFactory;
import org.hsqldb.DatabaseURL;
import org.springframework.beans.factory.BeanFactory;
import org.w3c.dom.Element;

/* loaded from: input_file:WEB-INF/lib/jobo-1.4.0.jar:net/matuschek/spider/WebRobot.class */
public class WebRobot implements Runnable, Cloneable {
    /** Short robot name. NOTE(review): not referenced in the visible part of this class. */
    private static final String ROBOT_NAME = "JoBo";
    /** Agent string the constructor passes to setAgentName(String). */
    private static final String AGENT_NAME = "JoBo/1.4 (http://www.matuschek.net/jobo.html)";
    /** Receives errors through handleException(...); setExceptionHandler ignores null. */
    protected RobotExceptionHandler exceptionHandler;
    /** NOTE(review): same value the constructor assigns to maxDepth as a literal. */
    private static final int DEFAULT_DEPTH = 10;
    /** URL the crawl starts from; assigned in setStartURL. */
    protected URL startURL;
    /** Host plus directory part of startURL (computed in setStartURL); prefix filter in basicURLCheck. */
    protected String startDir;
    /** Depth budget given to the start task in work(). */
    protected int maxDepth;
    /** When true, basicURLCheck accepts every URL. */
    protected boolean walkToOtherHosts;
    /** Optional document cache/store; the code checks it for null before most uses. */
    protected HttpDocManager docManager;
    /** HTTP client used for downloads and for the robots.txt checker. */
    protected HttpTool httpTool;
    /** log4j logger named after the runtime class (set in the constructor). */
    protected Category log;
    /** Referer used for the start task. */
    protected String startReferer;
    /** robots.txt checker; re-created by setAgentName. */
    protected NoRobots robCheck;
    /** Tasks that still have to be processed. */
    protected TaskList todo;
    /** Tasks already taken from the queue; walkTree uses it to skip repeats. */
    protected TaskList visited;
    /** Value returned by getIgnoreRobotsTxt(). */
    protected boolean ignoreRobotsTxt;
    /** Pause in seconds applied by sleepNow() (multiplied by 1000 for Thread.sleep). */
    protected int sleepTime;
    /** Fills HTML forms found in retrieved documents when form handlers are registered. */
    protected FormFiller formFiller;
    /** URL strings that walkTree processes even when the task is already in visited. */
    protected Vector visitMany;
    /** Optional progress callback; checked for null before use. */
    protected WebRobotCallback webRobotCallback;
    /** Set by stopRobot(); ends the walkTree loop and is cleared again in cleanUp(). */
    protected boolean stopIt;
    /** Optional additional URL filter consulted by isAllowed and isProcessingAllowed. */
    protected URLCheck urlCheck;
    /** While true, walkTree pauses in one-second steps after each task. */
    protected boolean sleep;
    /** Host+path prefixes (Strings) additionally accepted by basicURLCheck. */
    protected Vector allowedURLs;
    /** Accept every URL on the start URL's host. */
    protected boolean allowWholeHost;
    /** Age in seconds after which a cached document is re-requested; negative means never refresh. */
    protected long maxDocumentAge;
    /** Accept hosts that match the start host's domain (see basicURLCheck). */
    protected boolean allowWholeDomain;
    /** Treat hosts as equal when they differ only by a leading "www.". */
    protected boolean flexibleHostCheck;
    /** Optional filter chain applied to a document before it is processed. */
    protected FilterChain filters;
    /** Whether retrieveURL may consult the document manager's cache. */
    protected boolean allowCaching;
    /** Enables the duplicate-content detection branch in retrieveURL. */
    protected boolean duplicateCheck;
    /** Number of OutOfMemoryErrors handled so far (see handleMemoryError). */
    private int memoryLevel;
    /** When false, addTask/addTaskAtStart silently drop new tasks. */
    protected boolean activatedNewTasks;
    /** When false, walkTree no longer records tasks in visited. */
    protected boolean activatedUrlHistory;
    /** When false, retrieveURL no longer records content for duplicate detection. */
    protected boolean activatedContentHistory;
    /** Buffer allocated by the constructor and dropped at memory level 2; presumably a reserve for OOM handling. */
    private byte[] memoryBuffer;
    /** NOTE(review): not used in the visible part of this class. */
    protected int iteration;
    /** NOTE(review): same value the no-arg constructor passes as a literal. */
    private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
    /** Expected number of documents; the constructor argument. */
    protected int expectedDocumentCount;
    /** Cleared on the first OutOfMemoryError; presumably backs get/setContentVisitedURL (outside this view). */
    protected HashMap content2UrlMap;
    /** Statistics logged at the end of work(): documents found in the cache. */
    long countCache;
    /** Statistics: documents obtained by an actual retrieval. */
    long countWeb;
    /** Statistics: refresh attempts that produced no new document. */
    long countNoRefresh;
    /** Statistics: refresh attempts that produced a document. */
    long countRefresh;
    /** True once setFormHandlers received a non-empty vector; enables form processing in retrieveURL. */
    boolean hasFormHandlers;
    /** NOTE(review): not used in the visible part; presumably read by removeParameters. */
    protected Vector wasteParameters;
    /** Creation time of this robot in milliseconds. */
    protected long startTime;
    /** Number of retries allowed for a failing task before falling back to the cache. */
    protected int maxRetries;
    /** Age in seconds after which a cached fallback document is dropped; negative disables expiry. */
    protected long expirationAge;

    /**
     * Creates a robot sized for the given number of documents.
     *
     * <p>Fields that are not assigned here keep their Java default values
     * (null, false or 0).
     *
     * @param i expected document count; used as the initial capacity of the
     *          content map and of both task lists
     */
    public WebRobot(int i) {
        exceptionHandler = new DefaultRobotExceptionHandler();
        startDir = "";
        maxDepth = DEFAULT_DEPTH;
        httpTool = new HttpTool();
        startReferer = "-";
        sleepTime = 1;
        formFiller = new FormFiller();
        visitMany = new Vector();
        allowedURLs = new Vector();
        allowWholeHost = true;
        maxDocumentAge = -1L;
        allowWholeDomain = true;
        allowCaching = true;
        activatedNewTasks = true;
        activatedUrlHistory = true;
        activatedContentHistory = true;
        memoryBuffer = new byte[204800];
        expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
        wasteParameters = new Vector();
        startTime = System.currentTimeMillis();
        expirationAge = -1L;

        log = Category.getInstance(getClass().getName());
        content2UrlMap = new HashMap(i);
        registerVisitedList(new HashedMemoryTaskList(false, i));
        registerToDoList(new HashedMemoryTaskList(true, i));
        expectedDocumentCount = i;
        // Also creates the robots.txt checker for this agent name.
        setAgentName(AGENT_NAME);
    }

    /** Creates a robot sized for the default expected document count (50000). */
    public WebRobot() {
        this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
    }

    /**
     * Installs the list that holds the tasks still to be processed.
     *
     * @param taskList the new to-do list
     */
    public void registerToDoList(TaskList taskList) {
        todo = taskList;
    }

    /**
     * Installs the list that records the tasks already taken from the queue.
     *
     * @param taskList the new visited list
     */
    public void registerVisitedList(TaskList taskList) {
        visited = taskList;
    }

    /**
     * @return the URL the crawl starts from, or null if none was set
     */
    public URL getStartURL() {
        return startURL;
    }

    /**
     * Sets the start URL and derives the start directory from it.
     *
     * <p>The start directory is the host followed by the path up to and
     * including its last slash; a path without any slash yields host + "/".
     *
     * @param url the URL to start from
     */
    public void setStartURL(URL url) {
        String path = url.getPath();
        startURL = url;

        String directory;
        if (path.endsWith("/")) {
            directory = path;
        } else {
            int lastSlash = path.lastIndexOf('/');
            directory = (lastSlash < 0) ? "/" : path.substring(0, lastSlash + 1);
        }
        startDir = url.getHost() + directory;
    }

    /**
     * @return the depth budget given to the start task
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * @param i the depth budget given to the start task
     */
    public void setMaxDepth(int i) {
        maxDepth = i;
    }

    /**
     * @return the bandwidth setting of the underlying HTTP tool
     */
    public int getBandwidth() {
        return httpTool.getBandwidth();
    }

    /**
     * Forwards the bandwidth setting to the underlying HTTP tool.
     *
     * @param i the bandwidth value
     */
    public void setBandwidth(int i) {
        httpTool.setBandwidth(i);
    }

    /**
     * @return true if URLs on any host are accepted
     */
    public boolean getWalkToOtherHosts() {
        return walkToOtherHosts;
    }

    /**
     * @param z true to accept URLs on any host
     */
    public void setWalkToOtherHosts(boolean z) {
        walkToOtherHosts = z;
    }

    /**
     * @return true if every URL on the start host is accepted
     */
    public boolean getAllowWholeHost() {
        return allowWholeHost;
    }

    /**
     * @param z true to accept every URL on the start host
     */
    public void setAllowWholeHost(boolean z) {
        allowWholeHost = z;
    }

    /**
     * @return true if hosts in the start host's domain are accepted
     */
    public boolean getAllowWholeDomain() {
        return allowWholeDomain;
    }

    /**
     * @param z true to accept hosts in the start host's domain
     */
    public void setAllowWholeDomain(boolean z) {
        allowWholeDomain = z;
    }

    /**
     * @return true if hosts differing only by a leading "www." count as equal
     */
    public boolean getFlexibleHostCheck() {
        return flexibleHostCheck;
    }

    /**
     * @param z true to treat hosts differing only by a leading "www." as equal
     */
    public void setFlexibleHostCheck(boolean z) {
        flexibleHostCheck = z;
    }

    /**
     * @return true if documents may be served from the document manager's cache
     */
    public boolean getAllowCaching() {
        return allowCaching;
    }

    /**
     * @param z true to allow serving documents from the document manager's cache
     */
    public void setAllowCaching(boolean z) {
        allowCaching = z;
    }

    /**
     * @return the document manager, or null if none is installed
     */
    public HttpDocManager getDocManager() {
        return docManager;
    }

    /**
     * @param httpDocManager the document manager that caches, processes and stores documents
     */
    public void setDocManager(HttpDocManager httpDocManager) {
        docManager = httpDocManager;
    }

    /**
     * Forwards the cookie manager to the underlying HTTP tool.
     *
     * @param cookieManager the cookie manager to use
     */
    public void setCookieManager(CookieManager cookieManager) {
        httpTool.setCookieManager(cookieManager);
    }

    /**
     * @return the cookie manager of the underlying HTTP tool
     */
    public CookieManager getCookieManager() {
        return httpTool.getCookieManager();
    }

    /**
     * Forwards the download rule set to the underlying HTTP tool.
     *
     * @param downloadRuleSet the rules to apply
     */
    public void setDownloadRuleSet(DownloadRuleSet downloadRuleSet) {
        httpTool.setDownloadRuleSet(downloadRuleSet);
    }

    /**
     * @param uRLCheck additional URL filter consulted before retrieval and processing; may be null
     */
    public void setURLCheck(URLCheck uRLCheck) {
        urlCheck = uRLCheck;
    }

    /**
     * Forwards the proxy setting to the underlying HTTP tool.
     *
     * @param str the proxy specification understood by the HTTP tool
     * @throws HttpException if the HTTP tool rejects the value
     */
    public void setProxy(String str) throws HttpException {
        httpTool.setProxy(str);
    }

    /**
     * @return the proxy setting of the underlying HTTP tool
     */
    public String getProxy() {
        return httpTool.getProxy();
    }

    /**
     * @return the referer used for the start task
     */
    public String getStartReferer() {
        return startReferer;
    }

    /**
     * @param str the referer used for the start task
     */
    public void setStartReferer(String str) {
        startReferer = str;
    }

    /**
     * Switches robots.txt handling on or off.
     *
     * <p>The flag is passed to the robots.txt checker and also stored in
     * {@code ignoreRobotsTxt}, so that {@link #getIgnoreRobotsTxt()} reports
     * the value that was set here.
     *
     * @param z true to ignore robots.txt
     */
    public void setIgnoreRobotsTxt(boolean z) {
        this.robCheck.setIgnore(z);
        this.ignoreRobotsTxt = z;
    }

    /**
     * @return the pause between retrievals in seconds
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * @param i the pause between retrievals in seconds; values of 0 or less disable the pause
     */
    public void setSleepTime(int i) {
        sleepTime = i;
    }

    /**
     * Forwards the "from" address to the underlying HTTP tool.
     *
     * @param str the address to announce
     */
    public void setFromAddress(String str) {
        httpTool.setFromAddress(str);
    }

    /**
     * Installs the form handlers used to fill HTML forms.
     *
     * <p>Form processing in retrieveURL is enabled only while at least one
     * handler is registered. Passing null or an empty vector switches it off
     * again; previously the flag stayed set once it had been turned on.
     *
     * @param vector the form handlers; may be null or empty
     */
    public void setFormHandlers(Vector vector) {
        this.formFiller.setFormHandlers(vector);
        this.hasFormHandlers = vector != null && !vector.isEmpty();
    }

    /**
     * @return the form handlers registered with the form filler
     */
    public Vector getFormHandlers() {
        return formFiller.getFormHandlers();
    }

    /**
     * @return the agent name of the HTTP tool, or null if there is no HTTP tool
     */
    public String getAgentName() {
        return (httpTool == null) ? null : httpTool.getAgentName();
    }

    /**
     * Sets the agent name on the HTTP tool and creates a new robots.txt
     * checker for that name.
     *
     * @param str the agent name
     */
    public void setAgentName(String str) {
        httpTool.setAgentName(str);
        // The robots.txt checker is bound to an agent name, so it is replaced.
        robCheck = new NoRobots(str, httpTool);
    }

    /**
     * @return the timeout of the HTTP tool, or -1 if there is no HTTP tool
     */
    public int getTimeout() {
        return (httpTool == null) ? -1 : httpTool.getTimeout();
    }

    /**
     * Forwards the timeout to the underlying HTTP tool.
     *
     * @param i the timeout value
     */
    public void setTimeout(int i) {
        httpTool.setTimeout(i);
    }

    /**
     * @return the NTLM authorization of the HTTP tool, or null if there is no HTTP tool
     */
    public NTLMAuthorization getNtlmAuthorization() {
        return (httpTool == null) ? null : httpTool.getNtlmAuthorization();
    }

    /**
     * Forwards the NTLM authorization to the underlying HTTP tool.
     *
     * @param nTLMAuthorization the authorization data
     */
    public void setNtlmAuthorization(NTLMAuthorization nTLMAuthorization) {
        httpTool.setNtlmAuthorization(nTLMAuthorization);
    }

    /**
     * @return the current value of the ignoreRobotsTxt field
     */
    public boolean getIgnoreRobotsTxt() {
        return ignoreRobotsTxt;
    }

    /**
     * @return the URL strings that may be processed more than once
     */
    public Vector getVisitMany() {
        return visitMany;
    }

    /**
     * @param vector URL strings that may be processed more than once
     */
    public void setVisitMany(Vector vector) {
        visitMany = vector;
    }

    /**
     * Registers a callback on the underlying HTTP tool.
     *
     * @param httpToolCallback the callback to register
     */
    public void setHttpToolCallback(HttpToolCallback httpToolCallback) {
        httpTool.setCallback(httpToolCallback);
    }

    /**
     * @return the progress callback, or null if none is registered
     */
    public WebRobotCallback getWebRobotCallback() {
        return webRobotCallback;
    }

    /**
     * Registers the callback that is told about sleeping, queue size,
     * retrieved documents and completion.
     *
     * @param webRobotCallback the callback; may be null
     */
    public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
        this.webRobotCallback = webRobotCallback;
    }

    /**
     * @param z true to make walkTree pause after the current task, false to resume
     */
    public void setSleep(boolean z) {
        sleep = z;
    }

    /**
     * @return the current value of the pause flag
     */
    public boolean isSleeping() {
        return sleep;
    }

    /**
     * @param vector host+path prefixes (Strings) that are accepted in addition to the start directory
     */
    public void setAllowedURLs(Vector vector) {
        allowedURLs = vector;
    }

    /**
     * @return the additionally accepted host+path prefixes
     */
    public Vector getAllowedURLs() {
        return allowedURLs;
    }

    /**
     * Forwards the cookie switch to the underlying HTTP tool.
     *
     * @param z true to enable cookies
     */
    public void setEnableCookies(boolean z) {
        httpTool.setEnableCookies(z);
    }

    /**
     * @return the cookie switch of the underlying HTTP tool
     */
    public boolean getEnableCookies() {
        return httpTool.getEnableCookies();
    }

    /**
     * @param j age in seconds after which a cached document is re-requested;
     *          a negative value means cached documents are never refreshed
     */
    public void setMaxDocumentAge(long j) {
        maxDocumentAge = j;
    }

    /**
     * @return age in seconds after which a cached document is re-requested
     */
    public long getMaxDocumentAge() {
        return maxDocumentAge;
    }

    /**
     * @param filterChain filters applied to a document before it is processed; may be null
     */
    public void setFilters(FilterChain filterChain) {
        filters = filterChain;
    }

    /** Asks the underlying HTTP tool to clear its cookies. */
    public void clearCookies() {
        httpTool.clearCookies();
    }

    /** Runnable entry point; simply runs {@link #work()}. */
    @Override // java.lang.Runnable
    public void run() {
        work();
    }

    /**
     * Runs one complete crawl: queues the start task, walks the task tree,
     * cleans up and logs the retrieval statistics.
     */
    public void work() {
        todo.add(createRobotTask(startURL, maxDepth, startReferer));
        walkTree();
        cleanUp();

        String statistics = "Documents retrieved by: Web=" + countWeb
                + " Cache=" + countCache
                + " Refresh=" + countRefresh
                + " NoRefresh=" + countNoRefresh;
        log.info(statistics);
    }

    /** Asks the robot to stop; walkTree leaves its loop before taking the next task. */
    public void stopRobot() {
        stopIt = true;
    }

    /**
     * Processes queued tasks until the queue is empty or the robot is stopped.
     *
     * <p>Each task is skipped if it was already visited, unless its URL is
     * listed in {@code visitMany}. A retrieval that fails with an
     * OutOfMemoryError is retried after {@link #handleMemoryError} has freed
     * memory; that method rethrows the error once nothing is left to free.
     *
     * <p>While the pause flag is set, the method waits in one-second steps.
     * Interrupting the thread during that wait is treated as a stop request:
     * the interrupt status is restored and the robot stops. Previously the
     * interrupt was swallowed and a paused robot could not be cancelled.
     *
     * <p>NOTE(review): the whole task, including the download and the pause,
     * runs while holding the monitor of {@code visited}.
     */
    public void walkTree() {
        while (this.todo.size() > 0 && !this.stopIt) {
            synchronized (this.visited) {
                RobotTask removeFirst = this.todo.removeFirst();
                if (!this.visited.contains(removeFirst) || this.visitMany.contains(removeFirst.getUrl().toString())) {
                    if (this.activatedUrlHistory) {
                        this.visited.add(removeFirst);
                    }
                    // Retry the same task after each handled OutOfMemoryError.
                    boolean z = true;
                    while (z) {
                        try {
                            retrieveURL(removeFirst);
                            z = false;
                        } catch (OutOfMemoryError e) {
                            handleMemoryError(e);
                        }
                    }
                    while (this.sleep) {
                        if (this.webRobotCallback != null) {
                            this.webRobotCallback.webRobotSleeping(true);
                        }
                        try {
                            Thread.sleep(1000L);
                        } catch (InterruptedException e2) {
                            // Keep the interrupt visible to callers and end the crawl
                            // instead of spinning in the pause loop.
                            Thread.currentThread().interrupt();
                            this.stopIt = true;
                            break;
                        }
                    }
                    if (this.webRobotCallback != null) {
                        this.webRobotCallback.webRobotSleeping(false);
                    }
                    if (this.webRobotCallback != null) {
                        this.webRobotCallback.webRobotUpdateQueueStatus(this.todo.size());
                    }
                    spawnThread();
                } else {
                    this.log.debug("already visited: " + removeFirst.getUrl());
                }
            }
        }
        // finishThreads() dereferences the callback, hence the guard.
        if (this.webRobotCallback != null) {
            finishThreads();
        }
    }

    /**
     * Reacts to an OutOfMemoryError by freeing memory in stages.
     *
     * <ul>
     * <li>First error: clears the visited list and the content map and
     *     disables both histories.</li>
     * <li>Second error: stops accepting new tasks and releases the memory
     *     buffer.</li>
     * <li>Any later error: rethrows the error.</li>
     * </ul>
     *
     * @param outOfMemoryError the error that was caught
     * @throws OutOfMemoryError the given error, once nothing is left to free
     */
    protected void handleMemoryError(OutOfMemoryError outOfMemoryError) throws OutOfMemoryError {
        memoryLevel++;
        // DEFAULT_INDEX_END is a decompiler-substituted constant; presumably ")".
        log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + DefaultExpressionEngine.DEFAULT_INDEX_END);

        if (memoryLevel == 1) {
            visited.clear();
            activatedUrlHistory = false;
            content2UrlMap.clear();
            activatedContentHistory = false;
            System.gc();
            return;
        }
        if (memoryLevel == 2) {
            activatedNewTasks = false;
            memoryBuffer = null;
            System.gc();
            return;
        }
        // Level 3 rethrows directly; any other level first touches the buffer if it still exists.
        if (memoryLevel != 3 && memoryBuffer != null) {
            System.err.println((int) memoryBuffer[0]);
        }
        throw outOfMemoryError;
    }

    /**
     * Signals completion to the callback and finishes the document manager
     * if one is installed. The callback must not be null when this is called.
     */
    protected void finishThreads() {
        webRobotCallback.webRobotDone();
        if (docManager == null) {
            return;
        }
        docManager.finish();
    }

    /**
     * Hook called by walkTree after each processed task. This implementation
     * does nothing.
     */
    protected synchronized void spawnThread() {
        // intentionally empty
    }

    /**
     * Retrieves the document of one task and queues the tasks that follow from it.
     *
     * <p>Steps, in order:
     * <ol>
     * <li>Reject null tasks, exhausted depth and URLs that are not allowed.</li>
     * <li>For plain requests (method 1, no parameters) look the document up in
     *     the cache and decide whether it has to be refreshed.</li>
     * <li>Download if required; failed downloads are re-queued up to
     *     {@code maxRetries} times, after that a non-expired cached copy is used.</li>
     * <li>Optionally detect duplicate content and queue the original URL instead.</li>
     * <li>Handle redirects and HTTP errors.</li>
     * <li>Extract links and fill forms of HTML documents.</li>
     * <li>Filter, process and store the document through the document manager.</li>
     * </ol>
     *
     * <p>A missing document manager no longer causes a NullPointerException in
     * the retry fallback or in the duplicate check; both simply find nothing.
     *
     * @param robotTask the task to process; null is ignored
     */
    public void retrieveURL(RobotTask robotTask) {
        HtmlDocument htmlDocument;
        HttpDoc httpDoc;
        if (robotTask == null) {
            this.log.debug("Empty task found, ignoring");
            return;
        }
        long currentTimeMillis = System.currentTimeMillis();
        updateProgressInfo();
        URL url = robotTask.getUrl();
        String url2 = url.toString();
        String referer = robotTask.getReferer();
        int maxDepth = robotTask.getMaxDepth();
        if (maxDepth < 0) {
            this.log.info("Max search depth reached");
            return;
        }
        if (!isAllowed(url)) {
            this.log.info("Url '" + url + "' filtered out.");
            return;
        }
        // A URL without any file part gets a trailing slash.
        if (url.getFile().equals("")) {
            try {
                url2 = url2 + "/";
                url = new URL(url2);
                robotTask.setUrl(url);
            } catch (MalformedURLException e) {
                this.log.error("URL not well formed: " + e.toString());
                this.exceptionHandler.handleException(this, url, e);
                return;
            }
        }
        this.log.info("retrieving " + url2);
        this.httpTool.setReferer(referer);
        HttpDoc httpDoc2 = null;   // the document that is finally used
        Vector<URL> vector = null; // links of the document
        boolean z = false;         // true when the document was not freshly downloaded
        boolean z2 = true;         // true when a download (or refresh) is required
        if (this.docManager != null && this.allowCaching && robotTask.getMethod() == 1 && robotTask.getParamString() == null) {
            httpDoc2 = this.docManager.retrieveFromCache(url);
            if (httpDoc2 != null) {
                this.countCache++;
                // Age of the cached copy in seconds.
                double dateAsMilliSeconds = (currentTimeMillis - httpDoc2.getDateAsMilliSeconds()) / 1000;
                // MATCH_SCORE_QNAME is a decompiler-substituted constant; presumably 0.0.
                if (dateAsMilliSeconds < XPath.MATCH_SCORE_QNAME) {
                    this.log.warn("DocumentAge < 0!");
                }
                z2 = this.maxDocumentAge >= 0 && dateAsMilliSeconds > ((double) this.maxDocumentAge);
                if (z2) {
                    this.httpTool.setIfModifiedSince(new Date(httpDoc2.getLastModifiedAsMilliSeconds()));
                }
            } else {
                this.httpTool.setIfModifiedSince(null);
            }
        }
        if (z2) {
            boolean z3 = false; // true when the download failed
            try {
                if (url.getProtocol().equalsIgnoreCase("file")) {
                    httpDoc = retrieveFileURL(url, this.httpTool.getIfModifiedSince());
                } else {
                    httpDoc = this.httpTool.retrieveDocument(url, robotTask.getMethod(), robotTask.getParamString());
                    if (httpDoc != null) {
                        httpDoc.setDate(currentTimeMillis);
                    }
                    sleepNow();
                }
                if (httpDoc == null || httpDoc.isNotModified()) {
                    // Nothing new: keep the cached copy but refresh its date.
                    if (httpDoc2 != null) {
                        httpDoc2.setDate(currentTimeMillis);
                        httpDoc2.setCached(false);
                        httpDoc = null;
                    }
                } else if (!httpDoc.isOk() && !httpDoc.isRedirect()) {
                    z3 = true;
                }
            } catch (HttpException e2) {
                this.log.debug("Retrieval of " + url + " failed: " + e2.getMessage());
                z3 = true;
                httpDoc = null;
            }
            if (z3) {
                int retry = robotTask.retry();
                if (retry <= this.maxRetries) {
                    synchronized (this.visited) {
                        this.todo.add(robotTask);
                        this.visited.remove(robotTask);
                    }
                    this.log.info("Adding " + url + " for retry no. " + retry);
                    return;
                }
                // Retries exhausted: fall back to the cache, if there is one.
                httpDoc2 = this.docManager != null ? this.docManager.retrieveFromCache(url) : null;
                if (httpDoc2 == null) {
                    this.log.warn("Unsuccessfull retries for " + url);
                    return;
                }
                long dateAsMilliSeconds2 = (currentTimeMillis - httpDoc2.getDateAsMilliSeconds()) / 1000;
                if (this.expirationAge >= 0 && dateAsMilliSeconds2 >= this.expirationAge) {
                    this.log.warn("Cached document expired: " + url);
                    this.docManager.removeDocument(url);
                    return;
                } else {
                    httpDoc = httpDoc2;
                    z = true;
                    this.log.info("Cached document not expired: " + url);
                }
            }
            if (httpDoc != null) {
                this.countWeb++;
                httpDoc2 = httpDoc;
                vector = null;
                this.countRefresh++;
            } else {
                z = true;
                this.countNoRefresh++;
            }
        } else {
            z = true;
            this.log.debug("Page " + url + " retrieved from cache");
        }
        if (httpDoc2 == null) {
            this.log.info("not downloaded " + url);
            return;
        }
        // URL of a document with the same content, if duplicate detection finds one.
        String str = null;
        if (this.duplicateCheck) {
            str = getContentVisitedURL(httpDoc2);
            if (str != null) {
                this.log.info("URLs with same content found: " + url2 + " = " + str);
            } else if (this.docManager != null) {
                try {
                    str = this.docManager.findDuplicate(httpDoc2);
                    if (str != null) {
                        this.log.info("URLs with same content found in cache: " + url2 + " = " + str);
                    }
                } catch (IOException e3) {
                    e3.printStackTrace();
                }
            }
            if (str != null) {
                if (!removeParameters(url2).equals(removeParameters(str)) && !z && this.docManager != null) {
                    try {
                        HttpDoc retrieveFromCache = this.docManager.retrieveFromCache(new URL(str));
                        if (retrieveFromCache != null) {
                            httpDoc2.setLinks(retrieveFromCache.getLinks());
                        }
                        this.docManager.storeDocument(httpDoc2);
                    } catch (Exception e4) {
                        e4.printStackTrace();
                    }
                }
                // Continue with the URL that already carries this content.
                try {
                    RobotTask createRobotTask = createRobotTask(new URL(str), maxDepth, referer);
                    if (!this.visited.contains(createRobotTask)) {
                        addTask(createRobotTask);
                    }
                    return;
                } catch (MalformedURLException e5) {
                    e5.printStackTrace();
                    return;
                }
            }
        }
        if (httpDoc2.isUnauthorized()) {
            this.log.info("got HTTP Unauthorized for URL " + url);
        }
        if (!httpDoc2.isOk() && !z) {
            if (httpDoc2.isRedirect()) {
                String location = httpDoc2.getLocation();
                this.log.info("Got redirect to " + location);
                try {
                    addTaskAtStart(createRobotTask(new URL(url, location), maxDepth - 1, referer));
                    return;
                } catch (MalformedURLException e6) {
                    this.log.warn("Ignoring malformed redirect location '" + location + "' from " + url);
                    return;
                }
            }
            if (httpDoc2.isNotFound()) {
                this.exceptionHandler.handleException(this, url, new HttpException("Document not found"));
                return;
            } else if (httpDoc2.isUnauthorized()) {
                this.exceptionHandler.handleException(this, url, new HttpException("No authorization for the document."));
                return;
            } else {
                this.exceptionHandler.handleException(this, url, new HttpException("Unknown document error (Http return code " + httpDoc2.getHttpCode() + ")."));
                return;
            }
        }
        if (this.webRobotCallback != null) {
            this.webRobotCallback.webRobotRetrievedDoc(url2, httpDoc2.getContent() != null ? httpDoc2.getContent().length : 0);
        }
        try {
            if (httpDoc2.isHTML() && maxDepth > 0) {
                HttpHeader header = httpDoc2.getHeader(FileUploadBase.CONTENT_TYPE);
                if (header != null) {
                    // Use the charset announced in the content type header, if any.
                    String value = header.getValue();
                    int indexOf = value.toLowerCase().indexOf("charset=");
                    htmlDocument = indexOf > 0 ? new HtmlDocument(url, httpDoc2.getContent(), value.substring(indexOf + 8)) : new HtmlDocument(url, httpDoc2.getContent());
                } else {
                    htmlDocument = new HtmlDocument(url, httpDoc2.getContent());
                }
                if (maxDepth > 0) {
                    // str is always null here: a detected duplicate returned above.
                    if (str != null) {
                        httpDoc2.setLinks(this.docManager.retrieveFromCache(new URL(str)).getLinks());
                    } else if (z) {
                    }
                    if (vector == null) {
                        vector = htmlDocument.getLinks();
                        httpDoc2.setLinks(vector);
                    }
                    if (str == null) {
                        // Queue every distinct link once.
                        HashSet hashSet = new HashSet();
                        for (int i = 0; i < vector.size(); i++) {
                            URL elementAt = vector.elementAt(i);
                            this.log.info("Link: " + elementAt);
                            if (!hashSet.contains(elementAt)) {
                                hashSet.add(elementAt);
                                String url3 = url.toString();
                                // Strip user info from the referer; S_HTTP is a
                                // decompiler-substituted constant, presumably "http://".
                                if (url.getUserInfo() != null) {
                                    url3 = DatabaseURL.S_HTTP + url3.substring(url3.indexOf("@") + 1);
                                }
                                RobotTask createRobotTask2 = createRobotTask(vector.elementAt(i), maxDepth - 1, url3);
                                if (!this.visited.contains(createRobotTask2)) {
                                    // .jpg links go to the front of the queue.
                                    if (createRobotTask2.urlString.endsWith(".jpg")) {
                                        addTaskAtStart(createRobotTask2);
                                    } else {
                                        addTask(createRobotTask2);
                                    }
                                }
                            }
                        }
                    }
                }
                if (this.hasFormHandlers) {
                    Vector elements = htmlDocument.getElements("form");
                    for (int i2 = 0; i2 < elements.size(); i2++) {
                        ExtendedURL fillForm = this.formFiller.fillForm(url, (Element) elements.elementAt(i2));
                        if (fillForm != null) {
                            RobotTask createRobotTask3 = createRobotTask(fillForm.getURL(), maxDepth - 1, url.toString());
                            createRobotTask3.setParamString(fillForm.getParams());
                            createRobotTask3.setMethod(fillForm.getRequestMethod());
                            addTask(createRobotTask3);
                        }
                    }
                }
            }
        } catch (OutOfMemoryError e7) {
            // Must reach walkTree, which handles memory errors.
            throw e7;
        } catch (Throwable th) {
            this.log.error("Unexpected error while extraction links from url '" + url + "':" + th);
            th.printStackTrace();
        }
        if (this.docManager != null) {
            try {
                if (this.filters != null) {
                    httpDoc2 = this.filters.process(httpDoc2);
                } else {
                    this.log.debug("No filters defined");
                }
                if (isProcessingAllowed(httpDoc2)) {
                    this.docManager.processDocument(httpDoc2);
                } else {
                    // Replace the content by a marker but keep the original Content-MD5 header.
                    String headerValue = httpDoc2.getHeaderValue("Content-MD5");
                    httpDoc2.setContent("Not for indexing".getBytes());
                    httpDoc2.setHeaderValue("Content-MD5", headerValue);
                }
                try {
                    this.docManager.storeDocument(httpDoc2);
                } catch (Exception e8) {
                    this.log.warn("could not store (not for indexing) " + url2 + WARCConstants.COLON_SPACE + e8.getMessage());
                }
                if (this.activatedContentHistory && str == null) {
                    setContentVisitedURL(httpDoc2, url2);
                }
            } catch (DocManagerException e9) {
                this.log.error("could not process document: " + e9.getMessage());
                this.exceptionHandler.handleException(this, url, e9);
            } catch (FilterException e10) {
                this.log.error(e10.getMessage());
            }
        }
    }

    /**
     * Hook called at the start of every retrieval. This implementation does
     * nothing.
     */
    public void updateProgressInfo() {
        // intentionally empty
    }

    /**
     * Pauses for {@code sleepTime} seconds and tells the callback, if any,
     * when the pause starts and ends. Does nothing when {@code sleepTime}
     * is zero or negative.
     *
     * <p>If the thread is interrupted while pausing, the pause ends early and
     * the interrupt status is restored so that callers can react to it. The
     * millisecond value is computed as a long to avoid int overflow for very
     * large sleep times.
     */
    public void sleepNow() {
        if (this.sleepTime <= 0) {
            return;
        }
        synchronized (this) {
            if (this.webRobotCallback != null) {
                this.webRobotCallback.webRobotSleeping(true);
            }
            try {
                Thread.sleep(this.sleepTime * 1000L);
            } catch (InterruptedException e) {
                // Do not lose the interrupt request.
                Thread.currentThread().interrupt();
            }
            if (this.webRobotCallback != null) {
                this.webRobotCallback.webRobotSleeping(false);
            }
        }
    }

    /**
     * Builds a document for a "file" URL from the local file system.
     *
     * <p>The resulting code is "httpcode 404" if the file does not exist,
     * "httpcode 200" with the file content if it is newer than {@code date}
     * (or if {@code date} is null), and "httpcode 304" otherwise.
     *
     * @param url  the file URL
     * @param date only files modified after this date are read; may be null
     * @return the document describing the file
     * @throws HttpException if anything fails; carries the message of the cause
     */
    private HttpDoc retrieveFileURL(URL url, Date date) throws HttpException {
        HttpDoc doc = new HttpDoc();
        try {
            String host = url.getHost();
            String path = url.getFile();
            boolean hasHost = host != null && !host.equals("");
            if (hasHost) {
                path = "//" + host + path;
            } else if (path.startsWith(UURIFactory.BACKSLASH) || path.startsWith("/")) {
                // Drop the single leading separator of a host-less URL.
                path = path.substring(1);
            }

            String mimeType = getMimeTypeForFilename(path);
            if (mimeType != null) {
                doc.addHeader(new HttpHeader("content-type", mimeType));
            }

            File localFile = new File(path);
            if (!localFile.exists()) {
                doc.setHttpCode("httpcode 404");
                return doc;
            }

            long modified = localFile.lastModified();
            long threshold = 0L;
            if (date != null) {
                threshold = date.getTime();
            }
            if (modified > threshold) {
                doc.setContent(readFileToByteArray(localFile));
                doc.setHttpCode("httpcode 200");
            } else {
                doc.setHttpCode("httpcode 304");
            }
            doc.setLastModified(modified);
            doc.setDate(System.currentTimeMillis());
            doc.setURL(url);
            return doc;
        } catch (Exception e) {
            throw new HttpException(e.getMessage());
        }
    }

    /**
     * Guesses the MIME type from a file name.
     *
     * @param str the file name
     * @return "text/html" for names ending in ".html" or ".htm", otherwise null
     */
    protected String getMimeTypeForFilename(String str) {
        boolean looksLikeHtml = str.endsWith(".html") || str.endsWith(".htm");
        return looksLikeHtml ? "text/html" : null;
    }

    /** Resets the stop flag and empties both task lists after a crawl. */
    protected void cleanUp() {
        stopIt = false;
        visited.clear();
        todo.clear();
    }

    /**
     * Appends a task to the queue if it is allowed and new tasks are still
     * accepted.
     *
     * @param robotTask the task to queue
     */
    protected void addTask(RobotTask robotTask) {
        if (!taskAddAllowed(robotTask) || !activatedNewTasks) {
            return;
        }
        todo.add(robotTask);
    }

    /**
     * Puts a task at the front of the queue if it is allowed and new tasks
     * are still accepted.
     *
     * @param robotTask the task to queue
     */
    protected void addTaskAtStart(RobotTask robotTask) {
        if (!taskAddAllowed(robotTask) || !activatedNewTasks) {
            return;
        }
        todo.addAtStart(robotTask);
    }

    /**
     * Decides whether a task may be queued.
     *
     * @param robotTask the candidate task
     * @return true if the task is non-null, its URL is allowed and it is not
     *         already queued
     */
    protected boolean taskAddAllowed(RobotTask robotTask) {
        if (robotTask == null) {
            log.info("Null task not allowed");
            return false;
        }
        if (!isAllowed(robotTask.getUrl())) {
            return false;
        }
        return !todo.contains(robotTask);
    }

    /**
     * Checks a URL against the basic host/path rules, the optional URL check
     * and robots.txt, in that order.
     *
     * @param url the URL to check
     * @return true if all checks accept the URL
     */
    protected boolean isAllowed(URL url) {
        if (!basicURLCheck(url)) {
            return false;
        }
        boolean rejectedByUrlCheck = urlCheck != null && !urlCheck.checkURL(url);
        if (rejectedByUrlCheck) {
            log.debug("not allowed by URLCheck:" + url);
            return false;
        }
        if (!robCheck.ok(url)) {
            log.debug("not allowed by robots.txt:" + url);
            return false;
        }
        return true;
    }

    /**
     * Checks whether a retrieved document may be processed.
     *
     * @param httpDoc the retrieved document
     * @return false if the optional URL check or the download rule set of the
     *         HTTP tool rejects the document, otherwise true
     */
    protected boolean isProcessingAllowed(HttpDoc httpDoc) {
        URL docUrl = httpDoc.getURL();
        boolean rejectedByUrlCheck = urlCheck != null && !urlCheck.checkURLForProcessing(docUrl);
        if (rejectedByUrlCheck) {
            log.debug("processing not allowed by URLCheck:" + docUrl);
            return false;
        }
        DownloadRuleSet rules = httpTool.getDownloadRuleSet();
        if (rules != null && !rules.processAllowed(httpDoc.getHttpHeaders())) {
            log.debug("processing not allowed by DownloadRuleSet:" + docUrl);
            return false;
        }
        return true;
    }

    /**
     * Checks a URL against the host and path rules of this robot.
     *
     * <p>A URL is accepted if any of the following holds:
     * <ul>
     * <li>walking to other hosts is enabled,</li>
     * <li>host + path starts with the start directory,</li>
     * <li>the whole host is allowed and the host equals the start host,</li>
     * <li>the flexible host check is on and both hosts are equal after
     *     removing a leading "www.",</li>
     * <li>the whole domain is allowed and the host is the start host's domain
     *     or a sub-domain of it,</li>
     * <li>host + path starts with one of the allowed URL prefixes.</li>
     * </ul>
     *
     * <p>The domain rule matches on a label boundary, so "notexample.com" is
     * no longer accepted for the domain "example.com".
     *
     * @param url the URL to check
     * @return true if the URL is accepted
     */
    protected boolean basicURLCheck(URL url) {
        String str = url.getHost() + url.getPath();
        String lowerCase = url.getHost().toLowerCase();
        String lowerCase2 = this.startURL.getHost().toLowerCase();
        if (this.walkToOtherHosts || str.startsWith(this.startDir)) {
            return true;
        }
        if (this.allowWholeHost && url.getHost().equalsIgnoreCase(this.startURL.getHost())) {
            return true;
        }
        if (this.flexibleHostCheck && cutWWW(lowerCase).equalsIgnoreCase(cutWWW(lowerCase2))) {
            return true;
        }
        if (this.allowWholeDomain) {
            String domain = getDomain(lowerCase2);
            // Require a '.' before the domain so unrelated hosts sharing a suffix do not match.
            if (lowerCase.equals(domain) || lowerCase.endsWith("." + domain)) {
                return true;
            }
        }
        if (this.allowedURLs != null) {
            for (int i = 0; i < this.allowedURLs.size(); i++) {
                if (str.startsWith((String) this.allowedURLs.elementAt(i))) {
                    return true;
                }
            }
        }
        this.log.debug("URL " + str + " not allowed");
        return false;
    }

    /**
     * Strips a leading "www." (matched case-insensitively) from a host name.
     *
     * @param str the host name
     * @return {@code str} without its first four characters if it starts with
     *         "www.", otherwise {@code str} unchanged
     */
    private String cutWWW(String str) {
        boolean hasPrefix = str.toLowerCase().startsWith("www.");
        if (!hasPrefix) {
            return str;
        }
        return str.substring(4);
    }

    /**
     * Returns everything after the first dot of a host name.
     *
     * @param str the host name
     * @return the text following the first '.', or {@code str} itself when it
     *         contains no dot
     */
    private String getDomain(String str) {
        int firstDot = str.indexOf('.');
        if (firstDot < 0) {
            return str;
        }
        return str.substring(firstDot + 1);
    }

    /**
     * Returns the exception handler currently held by this robot.
     *
     * @return the value of the {@code exceptionHandler} field
     */
    public RobotExceptionHandler getExceptionHandler() {
        return exceptionHandler;
    }

    /**
     * Replaces the robot's exception handler.
     *
     * A {@code null} argument is ignored, so the current handler stays in place.
     *
     * @param robotExceptionHandler the new handler, or {@code null} for no change
     */
    public void setExceptionHandler(RobotExceptionHandler robotExceptionHandler) {
        if (robotExceptionHandler == null) {
            return;
        }
        exceptionHandler = robotExceptionHandler;
    }

    /**
     * Sets the start URL from its string form.
     *
     * The string is parsed into a {@link URL} and handed to {@code setStartURL}.
     * If it cannot be parsed, the problem is reported through the robot's logger
     * and {@code setStartURL} is not called.
     *
     * @param str the start URL as text
     */
    public void setStart(String str) {
        try {
            setStartURL(new URL(str));
        } catch (MalformedURLException e) {
            // Report through the class logger, like the rest of this class, instead of stderr.
            this.log.error("Ignoring malformed start URL: " + str, e);
        }
    }

    /**
     * Returns the start URL in its external string form.
     *
     * @return {@code getStartURL().toExternalForm()}, or {@code null} when
     *         {@code getStartURL()} returns {@code null}
     */
    public String getStart() {
        URL current = getStartURL();
        return current == null ? null : current.toExternalForm();
    }

    /**
     * Calls {@code finish()} on each collaborator that is present.
     *
     * The HTTP tool, the robots.txt checker and the document manager are
     * visited in that order; any of them that is {@code null} is skipped.
     */
    public void finish() {
        // HTTP tool first.
        if (httpTool != null) {
            httpTool.finish();
        }
        // Then the robots.txt checker.
        if (robCheck != null) {
            robCheck.finish();
        }
        // The document manager goes last.
        if (docManager != null) {
            docManager.finish();
        }
    }

    /**
     * Developer utility: prints one assignment line per instance field.
     *
     * For every declared field of {@code WebRobot} that is neither
     * {@code final} nor {@code static}, a line of the form
     * {@code robot.<name> = <name>;} is written to standard output, padded
     * with spaces to 50 characters and followed by a comment holding the
     * field's type name.
     *
     * @param strArr command line arguments; they are ignored, and a notice is
     *               written to standard error if any are given
     */
    public static void main(String[] strArr) {
        if (strArr.length > 0) {
            System.err.println("Arguments will be ignored!");
        }
        Field[] declaredFields = WebRobot.class.getDeclaredFields();
        for (int i = 0; i < declaredFields.length; i++) {
            Field field = declaredFields[i];
            int modifiers = field.getModifiers();
            if (Modifier.isFinal(modifiers) || Modifier.isStatic(modifiers)) {
                continue;
            }
            StringBuffer line = new StringBuffer(60);
            line.append("\t\trobot.").append(field.getName());
            line.append(" = ").append(field.getName()).append(';');
            // Pad so that the trailing type comments line up in one column.
            while (line.length() < 50) {
                line.append(' ');
            }
            // A plain ")" replaces the unrelated commons-configuration constant used before.
            line.append("// (").append(field.getType().getName()).append(')');
            System.out.println(line.toString());
        }
    }

    /**
     * Looks up the URL string stored for a document's content.
     *
     * The key is the document's content MD5; the lookup is done while holding
     * the monitor of {@code content2UrlMap}.
     *
     * @param httpDoc the document whose content MD5 is used as key
     * @return the string stored under that key, or {@code null} if there is none
     */
    public String getContentVisitedURL(HttpDoc httpDoc) {
        String md5Key = httpDoc.getContentMD5();
        synchronized (this.content2UrlMap) {
            return (String) this.content2UrlMap.get(md5Key);
        }
    }

    /**
     * Stores a URL string under a document's content MD5.
     *
     * The entry is written while holding the monitor of {@code content2UrlMap}.
     *
     * @param httpDoc the document whose content MD5 is used as key
     * @param str the URL string to store under that key
     */
    public void setContentVisitedURL(HttpDoc httpDoc, String str) {
        String md5Key = httpDoc.getContentMD5();
        synchronized (content2UrlMap) {
            content2UrlMap.put(md5Key, str);
        }
    }

    /**
     * Builds a task for a URL after stripping its waste parameters.
     *
     * @param url the target URL; it is passed through
     *            {@link #removeWasteParameters(URL)} before the task is created
     * @param i passed unchanged as the second {@code RobotTask} constructor argument
     * @param str passed unchanged as the third {@code RobotTask} constructor argument
     * @return the new task
     */
    private final RobotTask createRobotTask(URL url, int i, String str) {
        URL cleanedUrl = removeWasteParameters(url);
        return new RobotTask(cleanedUrl, i, str);
    }

    /**
     * Replaces the list of waste parameters.
     *
     * The vector is stored as given (no copy); {@link #removeWasteParameters(URL)}
     * reads it when cleaning URLs.
     *
     * @param vector the new list, stored in {@code wasteParameters}
     */
    public void setWasteParameters(Vector vector) {
        wasteParameters = vector;
    }

    /**
     * Returns the list of waste parameters.
     *
     * @return the vector held in {@code wasteParameters} (the instance itself,
     *         not a copy)
     */
    public Vector getWasteParameters() {
        return wasteParameters;
    }

    /**
     * Returns the URL without the query parameters listed in {@code wasteParameters}.
     *
     * The URL's external form is filtered by
     * {@link #removeParametersFromString(String, Vector)}. If filtering changes
     * nothing, or the filtered text cannot be parsed as a URL, the original
     * URL object is returned.
     *
     * @param url the URL to clean
     * @return a new URL without the waste parameters, or {@code url} itself
     */
    public URL removeWasteParameters(URL url) {
        String original = url.toExternalForm();
        String cleaned = removeParametersFromString(original, this.wasteParameters);
        // Compare by content; reference identity would depend on how the helper builds its result.
        if (original.equals(cleaned)) {
            return url;
        }
        try {
            return new URL(cleaned);
        } catch (MalformedURLException e) {
            this.log.warn("Could not rebuild URL after removing parameters: " + cleaned, e);
            return url;
        }
    }

    /**
     * Removes the named query parameters from a URL string.
     *
     * The query is the text between the first '?' and the fragment (first '#')
     * or the end of the string. It is split at '&amp;', and every parameter
     * that starts with {@code name=} for a name in {@code vector} is dropped.
     * The remaining parameters keep their order and the fragment is appended
     * unchanged.
     *
     * The input is returned as the same object when {@code str} or
     * {@code vector} is {@code null}, {@code vector} is empty, the string has
     * no '?' after its first character, the first '?' lies inside the
     * fragment, or no parameter matched.
     *
     * @param str the URL string, may be {@code null}
     * @param vector parameter names to remove; its elements are cast to {@code String}
     * @return the filtered string, or {@code str} itself if nothing was removed
     */
    public static String removeParametersFromString(String str, Vector vector) {
        if (str == null || vector == null || vector.isEmpty()) {
            return str;
        }
        int queryStart = str.indexOf('?');
        if (queryStart <= 0) {
            return str;
        }
        int fragmentStart = str.indexOf('#');
        if (fragmentStart >= 0 && fragmentStart < queryStart) {
            // The '?' is part of the fragment, so this string has no query component.
            return str;
        }

        String query;
        String fragment;
        if (fragmentStart < 0) {
            query = str.substring(queryStart + 1);
            fragment = "";
        } else {
            query = str.substring(queryStart + 1, fragmentStart);
            fragment = str.substring(fragmentStart);
        }

        StringBuffer result = new StringBuffer(str.substring(0, queryStart));
        StringTokenizer parameters = new StringTokenizer(query, "&");
        // '?' goes in front of the first kept parameter, '&' in front of the others.
        char separator = '?';
        boolean removedAny = false;
        while (parameters.hasMoreTokens()) {
            String parameter = parameters.nextToken();
            boolean waste = false;
            for (int i = 0; i < vector.size() && !waste; i++) {
                waste = parameter.startsWith(((String) vector.elementAt(i)) + "=");
            }
            if (waste) {
                removedAny = true;
            } else {
                result.append(separator).append(parameter);
                separator = '&';
            }
        }

        if (!removedAny) {
            // Hand back the caller's own string so "unchanged" is cheap to detect.
            return str;
        }
        return result.append(fragment).toString();
    }

    /**
     * Sets the retry limit.
     *
     * @param i the new value of the {@code maxRetries} field
     */
    public void setMaxRetries(int i) {
        maxRetries = i;
    }

    /**
     * Returns the retry limit.
     *
     * @return the value of the {@code maxRetries} field
     */
    public int getMaxRetries() {
        return maxRetries;
    }

    /**
     * Sets the expiration age.
     *
     * @param j the new value of the {@code expirationAge} field
     *          (unit not visible here; TODO confirm against the code that reads it)
     */
    public void setExpirationAge(long j) {
        expirationAge = j;
    }

    /**
     * Returns the expiration age.
     *
     * @return the value of the {@code expirationAge} field
     */
    public long getExpirationAge() {
        return expirationAge;
    }

    /**
     * Cuts a URL string off at its first '?'.
     *
     * @param str the URL string
     * @return the text before the first '?', or {@code str} itself when it
     *         contains no '?'
     */
    private static final String removeParameters(String str) {
        // A literal '?' replaces the log4j constant that merely happened to equal "?".
        int queryStart = str.indexOf('?');
        return queryStart >= 0 ? str.substring(0, queryStart) : str;
    }

    /**
     * Reads the complete content of a file into a byte array.
     *
     * The buffer is sized from {@link File#length()} and filled by reading
     * repeatedly until it is full or the stream ends, because a single
     * {@code read} call may return fewer bytes than requested. If the stream
     * ends early, only the bytes actually read are returned. The stream is
     * always closed; a failure while closing is ignored.
     *
     * @param file the file to read
     * @return the bytes read from the file
     * @throws IOException if the file cannot be opened or read, or if it is
     *         too large to fit into a single array
     */
    protected byte[] readFileToByteArray(File file) throws IOException {
        long size = file.length();
        if (size > Integer.MAX_VALUE) {
            // An (int) cast would silently wrap around and allocate a wrong-sized buffer.
            throw new IOException("File too large to read into a byte array: " + file);
        }
        byte[] content = new byte[(int) size];

        FileInputStream input = new FileInputStream(file);
        try {
            int filled = 0;
            while (filled < content.length) {
                int count = input.read(content, filled, content.length - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
            if (filled < content.length) {
                // The stream ended before the reported length: return what was read.
                byte[] shorter = new byte[filled];
                System.arraycopy(content, 0, shorter, 0, filled);
                content = shorter;
            }
            return content;
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // Best effort: the data has been read (or a read error is already propagating).
            }
        }
    }
}
