package org.archive.modules.writer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.io.ReplayInputStream;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPool;
import org.archive.io.warc.WARCWriterPoolSettings;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.modules.extractor.Link;
import org.archive.modules.recrawl.RecrawlAttributeConstants;
import org.archive.spring.ConfigPath;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;

/* loaded from: input_file:WEB-INF/lib/heritrix-modules-3.1.0.jar:org/archive/modules/writer/WARCWriterProcessor.class */
public class WARCWriterProcessor extends WriterPoolProcessor implements WARCWriterPoolSettings {
    private static final long serialVersionUID = 6182850087635847443L;

    private static final Logger logger = Logger.getLogger(WARCWriterProcessor.class.getName());

    /**
     * Two-level table of running counters: outer key, then inner key, then total.
     * Merged into by {@link #addStats(Map)} and read back by {@link #report()}.
     */
    private ConcurrentMap<String, ConcurrentMap<String, AtomicLong>> stats =
            new ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>>();

    /**
     * Counter printed by {@link #report()} as "Total CrawlURIs".
     * NOTE(review): nothing in the visible code increments it; presumably done in write() - confirm.
     */
    private AtomicLong urlsWritten = new AtomicLong();

    /** Source of record IDs; set to a {@link UUIDGenerator} by the constructor and replaceable via the setter. */
    RecordIDGenerator generator;

    /** Previously built result of {@link #getMetadata()}, returned by that method when non-null. */
    private transient List<String> cachedMetadata;

    /**
     * Supplies the default upper bound on the size of a single output file.
     *
     * @return the constant 1,000,000,000 (presumably a byte count - confirm against WriterPoolProcessor)
     */
    @Override // org.archive.modules.writer.WriterPoolProcessor
    public long getDefaultMaxFileSize() {
        final long defaultMaxFileSize = 1000L * 1000L * 1000L;
        return defaultMaxFileSize;
    }

    /**
     * Builds the default list of store paths: a single {@link ConfigPath} named
     * "warcs default store path" whose path is "warcs".
     *
     * @return a new, mutable one-element list
     */
    @Override // org.archive.modules.writer.WriterPoolProcessor
    public List<ConfigPath> getDefaultStorePaths() {
        List<ConfigPath> defaultPaths = new ArrayList<ConfigPath>();
        ConfigPath warcsPath = new ConfigPath("warcs default store path", "warcs");
        defaultPaths.add(warcsPath);
        return defaultPaths;
    }

    /**
     * Reads the "writeRequests" property; {@link #writeHttpRecords} consults it before writing a request record.
     *
     * @return the stored flag
     */
    public boolean getWriteRequests() {
        Boolean writeRequests = (Boolean) kp.get("writeRequests");
        return writeRequests.booleanValue();
    }

    /**
     * Stores the "writeRequests" property.
     *
     * @param z new value of the flag
     */
    public void setWriteRequests(boolean z) {
        Boolean writeRequests = Boolean.valueOf(z);
        kp.put("writeRequests", writeRequests);
    }

    /**
     * Reads the "writeMetadata" property; the HTTP and FTP writers consult it before writing a metadata record.
     *
     * @return the stored flag
     */
    public boolean getWriteMetadata() {
        Boolean writeMetadata = (Boolean) kp.get("writeMetadata");
        return writeMetadata.booleanValue();
    }

    /**
     * Stores the "writeMetadata" property.
     *
     * @param z new value of the flag
     */
    public void setWriteMetadata(boolean z) {
        Boolean writeMetadata = Boolean.valueOf(z);
        kp.put("writeMetadata", writeMetadata);
    }

    /**
     * Reads the "writeRevisitForIdenticalDigests" property, which gates the
     * identical-digest revisit branch in the HTTP and FTP writers.
     *
     * @return the stored flag
     */
    public boolean getWriteRevisitForIdenticalDigests() {
        Boolean writeRevisit = (Boolean) kp.get("writeRevisitForIdenticalDigests");
        return writeRevisit.booleanValue();
    }

    /**
     * Stores the "writeRevisitForIdenticalDigests" property.
     *
     * @param z new value of the flag
     */
    public void setWriteRevisitForIdenticalDigests(boolean z) {
        Boolean writeRevisit = Boolean.valueOf(z);
        kp.put("writeRevisitForIdenticalDigests", writeRevisit);
    }

    /**
     * Reads the "writeRevisitForNotModified" property, which gates the
     * fetch-status-304 revisit branch in {@link #writeHttpRecords}.
     *
     * @return the stored flag
     */
    public boolean getWriteRevisitForNotModified() {
        Boolean writeRevisit = (Boolean) kp.get("writeRevisitForNotModified");
        return writeRevisit.booleanValue();
    }

    /**
     * Stores the "writeRevisitForNotModified" property.
     *
     * @param z new value of the flag
     */
    public void setWriteRevisitForNotModified(boolean z) {
        Boolean writeRevisit = Boolean.valueOf(z);
        kp.put("writeRevisitForNotModified", writeRevisit);
    }

    /**
     * Exposes the generator this processor uses to create and qualify record IDs.
     *
     * @return the current generator
     */
    @Override // org.archive.io.warc.WARCWriterPoolSettings
    public RecordIDGenerator getRecordIDGenerator() {
        return generator;
    }

    /**
     * Replaces the generator used to create and qualify record IDs.
     *
     * @param recordIDGenerator the generator to use from now on
     */
    public void setRecordIDGenerator(RecordIDGenerator recordIDGenerator) {
        generator = recordIDGenerator;
    }

    public WARCWriterProcessor() {
        setWriteRequests(true);
        setWriteMetadata(true);
        setWriteRevisitForIdenticalDigests(true);
        setWriteRevisitForNotModified(true);
        this.generator = new UUIDGenerator();
    }

    /**
     * Creates a {@link WARCWriterPool} configured from this processor and installs it via {@code setPool}.
     *
     * @param atomicInteger counter handed through unchanged to the pool constructor
     */
    @Override // org.archive.modules.writer.WriterPoolProcessor
    protected void setupPool(AtomicInteger atomicInteger) {
        WARCWriterPool warcPool =
                new WARCWriterPool(atomicInteger, this, getPoolMaxActive(), getMaxWaitForIdleMs());
        setPool(warcPool);
    }

    /**
     * Writes the records for one crawled URI, if {@code shouldWrite} says it should be written.
     *
     * <p>When {@code shouldWrite(crawlURI)} is true, the work is delegated to
     * {@link #write(String, CrawlURI)} and its result is returned. Otherwise
     * {@code copyForwardWriteTagIfDupe(crawlURI)} is called. An {@link IOException}
     * raised along the way is not propagated: it is added to the URI's non-fatal
     * failures and logged at SEVERE, and processing proceeds.
     *
     * @param crawlURI the URI being processed
     * @return the result of {@code write} when records were written, otherwise {@link ProcessResult#PROCEED}
     */
    @Override // org.archive.modules.writer.WriterPoolProcessor, org.archive.modules.Processor
    protected ProcessResult innerProcessResult(CrawlURI crawlURI) {
        String lowerCaseScheme = crawlURI.getUURI().getScheme().toLowerCase();
        try {
            // The write must happen inside the try: write() declares IOException and
            // the handler below is the only place it is dealt with.
            if (shouldWrite(crawlURI)) {
                return write(lowerCaseScheme, crawlURI);
            }
            copyForwardWriteTagIfDupe(crawlURI);
        } catch (IOException e) {
            crawlURI.getNonFatalFailures().add(e);
            logger.log(Level.SEVERE, "Failed write of Records: " + crawlURI.toString(), (Throwable) e);
        }
        return ProcessResult.PROCEED;
    }

    /*  JADX ERROR: NullPointerException in pass: RegionMakerVisitor
        java.lang.NullPointerException
        */
    /**
     * Placeholder for the record-writing routine called by {@code innerProcessResult}.
     *
     * <p>NOTE(review): the decompiler failed on this method, so the real body is
     * missing. As it stands every call throws {@link UnsupportedOperationException};
     * the implementation must be restored from the original sources before this
     * class can write anything.
     *
     * @param r9 the lower-cased URI scheme, as passed by {@code innerProcessResult}
     * @param r10 the URI whose records are to be written
     * @return never returns normally in this stub
     * @throws java.io.IOException declared by the signature; not thrown by this stub
     * @throws UnsupportedOperationException always, in this stub
     */
    protected org.archive.modules.ProcessResult write(java.lang.String r9, org.archive.modules.CrawlURI r10) throws java.io.IOException {
        /*
            Method dump skipped, instructions count: 445
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: org.archive.modules.writer.WARCWriterProcessor.write(java.lang.String, org.archive.modules.CrawlURI):org.archive.modules.ProcessResult");
    }

    /**
     * Adds every value in the given two-level map onto the matching counter in
     * {@code stats}, creating missing groups and counters on the way.
     *
     * @param map increments to apply, keyed by outer key and then inner key
     */
    protected void addStats(Map<String, Map<String, Long>> map) {
        for (Map.Entry<String, Map<String, Long>> group : map.entrySet()) {
            String groupKey = group.getKey();
            if (stats.get(groupKey) == null) {
                stats.putIfAbsent(groupKey, new ConcurrentHashMap<String, AtomicLong>());
            }
            ConcurrentMap<String, AtomicLong> counters = stats.get(groupKey);

            for (Map.Entry<String, Long> increment : group.getValue().entrySet()) {
                String counterKey = increment.getKey();
                long delta = increment.getValue().longValue();

                AtomicLong counter = counters.get(counterKey);
                if (counter == null) {
                    // A null result means our new counter, already seeded with delta,
                    // was installed; non-null means another counter got there first.
                    counter = counters.putIfAbsent(counterKey, new AtomicLong(delta));
                }
                if (counter != null) {
                    counter.addAndGet(delta);
                }
            }
        }
    }

    /**
     * Writes the response record for a DNS lookup. When the URI's data map holds
     * a non-empty DNS server IP, it is attached as the record's IP header;
     * otherwise no extra headers are passed.
     *
     * @param crawlURI the DNS URI that was fetched
     * @param wARCWriter writer receiving the record
     * @param uri record ID for the response
     * @param str timestamp string handed to the writer
     * @throws IOException if the record cannot be written
     */
    private void writeDnsRecords(CrawlURI crawlURI, WARCWriter wARCWriter, URI uri, String str) throws IOException {
        String dnsServerIp = (String) crawlURI.getData().get(CoreAttributeConstants.A_DNS_SERVER_IP_LABEL);
        boolean hasServerIp = dnsServerIp != null && dnsServerIp.length() > 0;

        ANVLRecord headers = null;
        if (hasServerIp) {
            headers = new ANVLRecord(1);
            headers.addLabelValue(WARCConstants.HEADER_KEY_IP, dnsServerIp);
        }
        writeResponse(wARCWriter, str, crawlURI.getContentType(), uri, crawlURI, headers);
    }

    /**
     * Writes the response record for a whois fetch, using the URI's own content
     * type and no extra headers.
     *
     * @param wARCWriter writer receiving the record
     * @param crawlURI the whois URI that was fetched
     * @param uri record ID for the response
     * @param str timestamp string handed to the writer
     * @throws IOException if the record cannot be written
     */
    private void writeWhoisRecords(WARCWriter wARCWriter, CrawlURI crawlURI, URI uri, String str) throws IOException {
        String contentType = crawlURI.getContentType();
        writeResponse(wARCWriter, str, contentType, uri, crawlURI, null);
    }

    /**
     * Writes the records for an HTTP fetch: one primary record (an
     * identical-digest revisit, a not-modified revisit, or a full response),
     * then, depending on the write flags, a request record and a metadata
     * record that both point back to the primary one.
     *
     * @param crawlURI the HTTP URI that was fetched
     * @param wARCWriter writer receiving the records
     * @param uri base record ID
     * @param str timestamp string handed to the writer
     * @throws IOException if any record cannot be written
     */
    private void writeHttpRecords(CrawlURI crawlURI, WARCWriter wARCWriter, URI uri, String str) throws IOException {
        ANVLRecord primaryHeaders = new ANVLRecord(5);
        if (crawlURI.getContentDigest() != null) {
            primaryHeaders.addLabelValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST, crawlURI.getContentDigestSchemeString());
        }
        primaryHeaders.addLabelValue(WARCConstants.HEADER_KEY_IP, getHostAddress(crawlURI));

        final URI primaryId;
        if (IdenticalDigestDecideRule.hasIdenticalDigest(crawlURI) && getWriteRevisitForIdenticalDigests()) {
            primaryId = writeRevisitDigest(wARCWriter, str, WARCConstants.HTTP_RESPONSE_MIMETYPE, uri, crawlURI, primaryHeaders);
        } else if (crawlURI.getFetchStatus() == 304 && getWriteRevisitForNotModified()) {
            primaryId = writeRevisitNotModified(wARCWriter, str, uri, crawlURI, primaryHeaders);
        } else {
            // Full response: note why the capture was cut short, if it was.
            Collection<String> annotations = crawlURI.getAnnotations();
            String truncation = annotations.contains("timeTrunc") ? "time"
                    : annotations.contains("lenTrunc") ? "length"
                    : annotations.contains("headerTrunc") ? WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_HEAD
                    : null;
            if (truncation != null) {
                primaryHeaders.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, truncation);
            }
            primaryId = writeResponse(wARCWriter, str, WARCConstants.HTTP_RESPONSE_MIMETYPE, uri, crawlURI, primaryHeaders);
        }

        // Request and metadata records share one header set pointing at the primary record.
        ANVLRecord relatedHeaders = new ANVLRecord(1);
        relatedHeaders.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO, "<" + primaryId.toString() + ">");
        if (getWriteRequests()) {
            writeRequest(wARCWriter, str, WARCConstants.HTTP_REQUEST_MIMETYPE, uri, crawlURI, relatedHeaders);
        }
        if (getWriteMetadata()) {
            writeMetadata(wARCWriter, str, uri, crawlURI, relatedHeaders);
        }
    }

    /**
     * Writes the records for an FTP fetch: the control conversation as a
     * metadata record; then, if the URI has a recorder, either an
     * identical-digest revisit record or a resource record; and finally, if
     * enabled, a metadata record pointing at the last record written.
     *
     * @param wARCWriter writer receiving the records
     * @param crawlURI the FTP URI that was fetched
     * @param uri base record ID
     * @param str timestamp string handed to the writer
     * @throws IOException if any record cannot be written
     */
    private void writeFtpRecords(WARCWriter wARCWriter, CrawlURI crawlURI, URI uri, String str) throws IOException {
        ANVLRecord controlHeaders = new ANVLRecord(3);
        controlHeaders.addLabelValue(WARCConstants.HEADER_KEY_IP, getHostAddress(crawlURI));
        String conversation = crawlURI.getData().get(CoreAttributeConstants.A_FTP_CONTROL_CONVERSATION).toString();
        URI latestId = writeFtpControlConversation(wARCWriter, str, uri, crawlURI, controlHeaders, conversation);

        // The digest is added only after the control conversation has been written,
        // so it can only affect a later revisit record that reuses these headers.
        if (crawlURI.getContentDigest() != null) {
            controlHeaders.addLabelValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST, crawlURI.getContentDigestSchemeString());
        }

        if (crawlURI.getRecorder() != null) {
            boolean writeAsRevisit =
                    IdenticalDigestDecideRule.hasIdenticalDigest(crawlURI) && getWriteRevisitForIdenticalDigests();
            if (writeAsRevisit) {
                latestId = writeRevisitDigest(wARCWriter, str, null, uri, crawlURI, controlHeaders, 0L);
            } else {
                ANVLRecord resourceHeaders = new ANVLRecord(3);
                Collection<String> annotations = crawlURI.getAnnotations();
                String truncation = null;
                if (annotations.contains("timeTrunc")) {
                    truncation = "time";
                } else if (annotations.contains("lenTrunc")) {
                    truncation = "length";
                } else if (annotations.contains("headerTrunc")) {
                    truncation = WARCConstants.NAMED_FIELD_TRUNCATED_VALUE_HEAD;
                }
                if (truncation != null) {
                    resourceHeaders.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, truncation);
                }
                if (crawlURI.getContentDigest() != null) {
                    resourceHeaders.addLabelValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST, crawlURI.getContentDigestSchemeString());
                }
                resourceHeaders.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO, "<" + latestId.toString() + ">");
                latestId = writeResource(wARCWriter, str, crawlURI.getContentType(), uri, crawlURI, resourceHeaders);
            }
        }

        if (!getWriteMetadata()) {
            return;
        }
        ANVLRecord metadataHeaders = new ANVLRecord(1);
        metadataHeaders.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO, "<" + latestId.toString() + ">");
        writeMetadata(wARCWriter, str, uri, crawlURI, metadataHeaders);
    }

    /**
     * Writes an FTP control conversation as a metadata record whose body is the
     * conversation text encoded as UTF-8.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param uri base record ID; the record's own ID is this ID qualified with type=metadata
     * @param crawlURI the FTP URI the conversation belongs to
     * @param aNVLRecord extra headers for the record
     * @param str2 the control conversation text
     * @return the qualified ID under which the record was written
     * @throws IOException if the ID cannot be qualified or the record cannot be written
     */
    protected URI writeFtpControlConversation(WARCWriter wARCWriter, String str, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord, String str2) throws IOException {
        URI metadataId = qualifyRecordID(uri, "type", "metadata");
        // Encode once so the stream and the declared length describe the same bytes.
        byte[] conversationBytes = str2.getBytes("UTF-8");
        wARCWriter.writeMetadataRecord(crawlURI.toString(), str, WARCConstants.FTP_CONTROL_CONVERSATION_MIMETYPE, metadataId, aNVLRecord, new ByteArrayInputStream(conversationBytes), conversationBytes.length);
        return metadataId;
    }

    /**
     * Writes a request record whose body is the recorder's captured output.
     * The replay stream is closed quietly whether or not the write succeeds.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param str2 MIME type of the record
     * @param uri base record ID; the record's own ID is this ID qualified with type=request
     * @param crawlURI the URI whose recorder supplies the body
     * @param aNVLRecord extra headers for the record
     * @return the qualified ID under which the record was written
     * @throws IOException if the stream cannot be opened or the record cannot be written
     */
    protected URI writeRequest(WARCWriter wARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        final URI requestId = qualifyRecordID(uri, "type", "request");
        final ReplayInputStream recordedRequest = crawlURI.getRecorder().getRecordedOutput().getReplayInputStream();
        try {
            final String targetUri = crawlURI.toString();
            final long requestLength = crawlURI.getRecorder().getRecordedOutput().getSize();
            wARCWriter.writeRequestRecord(targetUri, str, str2, requestId, aNVLRecord, recordedRequest, requestLength);
        } finally {
            IOUtils.closeQuietly((InputStream) recordedRequest);
        }
        return requestId;
    }

    /**
     * Writes a response record whose body is the recorder's captured input.
     * The replay stream is closed quietly whether or not the write succeeds.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param str2 MIME type of the record
     * @param uri record ID, used as given
     * @param crawlURI the URI whose recorder supplies the body
     * @param aNVLRecord extra headers for the record; may be null, as some callers pass none
     * @return {@code uri}, unchanged
     * @throws IOException if the stream cannot be opened or the record cannot be written
     */
    protected URI writeResponse(WARCWriter wARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        final ReplayInputStream recordedResponse = crawlURI.getRecorder().getRecordedInput().getReplayInputStream();
        try {
            final String targetUri = crawlURI.toString();
            final long responseLength = crawlURI.getRecorder().getRecordedInput().getSize();
            wARCWriter.writeResponseRecord(targetUri, str, str2, uri, aNVLRecord, recordedResponse, responseLength);
        } finally {
            IOUtils.closeQuietly((InputStream) recordedResponse);
        }
        return uri;
    }

    /**
     * Writes a resource record whose body is the recorder's captured input.
     * The replay stream is closed quietly whether or not the write succeeds.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param str2 MIME type of the record
     * @param uri record ID, used as given
     * @param crawlURI the URI whose recorder supplies the body
     * @param aNVLRecord extra headers for the record
     * @return {@code uri}, unchanged
     * @throws IOException if the stream cannot be opened or the record cannot be written
     */
    protected URI writeResource(WARCWriter wARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        final ReplayInputStream recordedContent = crawlURI.getRecorder().getRecordedInput().getReplayInputStream();
        try {
            final String targetUri = crawlURI.toString();
            final long contentLength = crawlURI.getRecorder().getRecordedInput().getSize();
            wARCWriter.writeResourceRecord(targetUri, str, str2, uri, aNVLRecord, recordedContent, contentLength);
        } finally {
            IOUtils.closeQuietly((InputStream) recordedContent);
        }
        return uri;
    }

    /**
     * Writes an identical-digest revisit record, choosing the body length from
     * the recorder: the recorded input's content-begin offset when it is
     * positive, otherwise the full recorded size.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param str2 MIME type of the record
     * @param uri record ID, used as given
     * @param crawlURI the URI whose recorder supplies the body
     * @param aNVLRecord headers for the record; the delegate adds further labels to it
     * @return {@code uri}, unchanged
     * @throws IOException if the record cannot be written
     */
    protected URI writeRevisitDigest(WARCWriter wARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        long contentBegin = crawlURI.getRecorder().getRecordedInput().getContentBegin();
        long revisitLength;
        if (contentBegin > 0) {
            revisitLength = contentBegin;
        } else {
            revisitLength = crawlURI.getRecorder().getRecordedInput().getSize();
        }
        return writeRevisitDigest(wARCWriter, str, str2, uri, crawlURI, aNVLRecord, revisitLength);
    }

    /**
     * Writes an identical-digest revisit record with an explicit body length.
     * The identical-digest profile and a "length" truncation label are added to
     * the supplied headers first; after a successful write the URI is annotated
     * with "warcRevisit:digest". The replay stream is always closed quietly.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param str2 MIME type of the record; may be null, as the FTP caller passes none
     * @param uri record ID, used as given
     * @param crawlURI the URI whose recorder supplies the body
     * @param aNVLRecord headers for the record; modified by this call
     * @param j length value handed to the writer
     * @return {@code uri}, unchanged
     * @throws IOException if the stream cannot be opened or the record cannot be written
     */
    protected URI writeRevisitDigest(WARCWriter wARCWriter, String str, String str2, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord, long j) throws IOException {
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_PROFILE, WARCConstants.PROFILE_REVISIT_IDENTICAL_DIGEST);
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, "length");

        final ReplayInputStream recordedResponse = crawlURI.getRecorder().getRecordedInput().getReplayInputStream();
        try {
            final String targetUri = crawlURI.toString();
            wARCWriter.writeRevisitRecord(targetUri, str, str2, uri, aNVLRecord, recordedResponse, j);
            // Annotate only once the record has actually been written.
            crawlURI.getAnnotations().add("warcRevisit:digest");
        } finally {
            IOUtils.closeQuietly((InputStream) recordedResponse);
        }
        return uri;
    }

    /**
     * Writes a not-modified revisit record with a length of zero and no MIME
     * type. The not-modified profile is added to the supplied headers, along
     * with the ETag and Last-Modified response headers when the URI is an HTTP
     * transaction, and a "length" truncation label. After a successful write
     * the URI is annotated with "warcRevisit:notModified".
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param uri record ID, used as given
     * @param crawlURI the URI that was revisited
     * @param aNVLRecord headers for the record; modified by this call
     * @return {@code uri}, unchanged
     * @throws IOException if the stream cannot be opened or the record cannot be written
     */
    protected URI writeRevisitNotModified(WARCWriter wARCWriter, String str, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_PROFILE, WARCConstants.PROFILE_REVISIT_NOT_MODIFIED);
        if (crawlURI.isHttpTransaction()) {
            HttpMethod method = crawlURI.getHttpMethod();
            saveHeader(RecrawlAttributeConstants.A_ETAG_HEADER, method, aNVLRecord, WARCConstants.HEADER_KEY_ETAG);
            saveHeader(RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER, method, aNVLRecord, WARCConstants.HEADER_KEY_LAST_MODIFIED);
        }
        aNVLRecord.addLabelValue(WARCConstants.HEADER_KEY_TRUNCATED, "length");

        final ReplayInputStream recordedResponse = crawlURI.getRecorder().getRecordedInput().getReplayInputStream();
        try {
            final String targetUri = crawlURI.toString();
            wARCWriter.writeRevisitRecord(targetUri, str, null, uri, aNVLRecord, recordedResponse, 0L);
            // Annotate only once the record has actually been written.
            crawlURI.getAnnotations().add("warcRevisit:notModified");
        } finally {
            IOUtils.closeQuietly((InputStream) recordedResponse);
        }
        return uri;
    }

    /**
     * Copies one HTTP response header into a record's headers, if the response has it.
     *
     * @param str name of the response header to look up
     * @param httpMethod HTTP method object holding the response headers
     * @param aNVLRecord headers to add the value to
     * @param str2 label under which the value is stored
     */
    protected void saveHeader(String str, HttpMethod httpMethod, ANVLRecord aNVLRecord, String str2) {
        Header found = httpMethod.getResponseHeader(str);
        if (found == null) {
            return;
        }
        aNVLRecord.addLabelValue(str2, found.getValue());
    }

    /**
     * Writes a metadata record describing how the URI was reached and fetched.
     *
     * <p>The body is an ANVL record holding: a "seed" label for seeds, or
     * otherwise the optional "force-fetch", "via", "hopsFromSeed" and
     * "sourceTag" fields; "fetchTimeMs" when the computed duration is not
     * negative; "ftpFetchStatus" when present in the URI's data; and one
     * "outlink" field per out-link.
     *
     * @param wARCWriter writer receiving the record
     * @param str timestamp string handed to the writer
     * @param uri base record ID; the record's own ID is this ID qualified with type=metadata
     * @param crawlURI the URI being described
     * @param aNVLRecord extra headers for the record
     * @return the qualified ID under which the record was written
     * @throws IOException if the ID cannot be qualified or the record cannot be written
     */
    protected URI writeMetadata(WARCWriter wARCWriter, String str, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        URI metadataId = qualifyRecordID(uri, "type", "metadata");
        ANVLRecord body = new ANVLRecord();
        if (crawlURI.isSeed()) {
            body.addLabel("seed");
        } else {
            if (crawlURI.forceFetch()) {
                body.addLabel("force-fetch");
            }
            String via = flattenVia(crawlURI);
            if (StringUtils.isNotBlank(via)) {
                body.addLabelValue("via", via);
            }
            String pathFromSeed = crawlURI.getPathFromSeed();
            if (StringUtils.isNotBlank(pathFromSeed)) {
                body.addLabelValue("hopsFromSeed", pathFromSeed);
            }
            if (crawlURI.containsDataKey("source")) {
                body.addLabelValue("sourceTag", (String) crawlURI.getData().get("source"));
            }
        }
        long fetchDurationMs = crawlURI.getFetchCompletedTime() - crawlURI.getFetchBeginTime();
        if (fetchDurationMs > -1) {
            body.addLabelValue("fetchTimeMs", Long.toString(fetchDurationMs));
        }
        if (crawlURI.getData().containsKey(CoreAttributeConstants.A_FTP_FETCH_STATUS)) {
            body.addLabelValue("ftpFetchStatus", crawlURI.getData().get(CoreAttributeConstants.A_FTP_FETCH_STATUS).toString());
        }
        Collection<Link> outLinks = crawlURI.getOutLinks();
        if (outLinks != null) {
            for (Link outLink : outLinks) {
                body.addLabelValue("outlink", outLink.toString());
            }
        }
        // Serialize once so the stream and the declared length describe the same bytes.
        byte[] bodyBytes = body.getUTF8Bytes();
        wARCWriter.writeMetadataRecord(crawlURI.toString(), str, ANVLRecord.MIMETYPE, metadataId, aNVLRecord, new ByteArrayInputStream(bodyBytes), bodyBytes.length);
        return metadataId;
    }

    /**
     * Asks the configured generator for a record ID.
     *
     * @return the ID produced by the generator
     * @throws IOException declared by the signature for callers to handle
     */
    protected URI getRecordID() throws IOException {
        URI recordId = generator.getRecordID();
        return recordId;
    }

    /**
     * Derives a record ID from a base ID by handing the generator a single
     * key/value qualifier.
     *
     * @param uri base record ID
     * @param str qualifier key
     * @param str2 qualifier value
     * @return the ID returned by the generator for that base and qualifier
     * @throws IOException declared by the signature for callers to handle
     */
    protected URI qualifyRecordID(URI uri, String str, String str2) throws IOException {
        Map<String, String> qualifiers = new HashMap<String, String>(1);
        qualifiers.put(str, str2);
        return generator.qualifyRecordID(uri, qualifiers);
    }

    /**
     * Builds the crawl-level metadata as a single ANVL-formatted string.
     *
     * <p>The record holds the software version, the local host's IP and
     * canonical name (skipped with a warning if the host cannot be resolved),
     * the format and conformance strings, and whichever operator-supplied
     * fields of the metadata provider are not blank. The result is built on the
     * first call and returned from {@code cachedMetadata} on later calls.
     *
     * @return a one-element list holding the ANVL text
     */
    @Override // org.archive.modules.writer.WriterPoolProcessor, org.archive.io.WriterPoolSettings
    public List<String> getMetadata() {
        if (this.cachedMetadata != null) {
            return this.cachedMetadata;
        }
        ANVLRecord aNVLRecord = new ANVLRecord(7);
        aNVLRecord.addLabelValue("software", "Heritrix/" + ArchiveUtils.VERSION + " http://crawler.archive.org");
        try {
            InetAddress localHost = InetAddress.getLocalHost();
            aNVLRecord.addLabelValue("ip", localHost.getHostAddress());
            aNVLRecord.addLabelValue("hostname", localHost.getCanonicalHostName());
        } catch (UnknownHostException e) {
            logger.log(Level.WARNING, "unable top obtain local crawl engine host", (Throwable) e);
        }
        aNVLRecord.addLabelValue("format", "WARC File Format 1.0");
        aNVLRecord.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
        CrawlMetadata metadataProvider = getMetadataProvider();
        addIfNotBlank(aNVLRecord, "operator", metadataProvider.getOperator());
        addIfNotBlank(aNVLRecord, "publisher", metadataProvider.getOrganization());
        addIfNotBlank(aNVLRecord, "audience", metadataProvider.getAudience());
        addIfNotBlank(aNVLRecord, "isPartOf", metadataProvider.getJobName());
        addIfNotBlank(aNVLRecord, "description", metadataProvider.getDescription());
        addIfNotBlank(aNVLRecord, "robots", metadataProvider.getRobotsPolicyName().toLowerCase());
        addIfNotBlank(aNVLRecord, "http-header-user-agent", metadataProvider.getUserAgent());
        addIfNotBlank(aNVLRecord, "http-header-from", metadataProvider.getOperatorFrom());
        // Remember the result; without this assignment the null check above never
        // succeeds and the record is rebuilt on every call.
        this.cachedMetadata = Collections.singletonList(aNVLRecord.toString());
        return this.cachedMetadata;
    }

    /**
     * Adds a label/value pair to the record unless the value is blank.
     *
     * @param aNVLRecord record to add to
     * @param str label of the field
     * @param str2 candidate value; skipped when {@code StringUtils} considers it blank
     */
    protected void addIfNotBlank(ANVLRecord aNVLRecord, String str, String str2) {
        if (StringUtils.isBlank(str2)) {
            return;
        }
        aNVLRecord.addLabelValue(str, str2);
    }

    /**
     * Logs the raw statistics table at INFO and renders a plain-text summary:
     * processor name, URI count, revisit record count, crawled content bytes
     * (response plus resource), total uncompressed bytes, and total size on disk.
     *
     * @return the multi-line report text
     */
    @Override // org.archive.modules.Processor
    public String report() {
        logger.info("final stats: " + stats);

        StringBuilder out = new StringBuilder();
        out.append("Processor: ").append(getClass().getName()).append("\n");
        out.append("  Function:          Writes WARCs\n");
        out.append("  Total CrawlURIs:   ").append(urlsWritten).append("\n");
        out.append("  Revisit records:   ")
                .append(WARCWriter.getStat(stats, WARCConstants.REVISIT, WARCWriter.NUM_RECORDS))
                .append("\n");

        long responseBytes = WARCWriter.getStat(stats, "response", WARCWriter.CONTENT_BYTES);
        long resourceBytes = WARCWriter.getStat(stats, "resource", WARCWriter.CONTENT_BYTES);
        long contentBytes = responseBytes + resourceBytes;
        out.append("  Crawled content bytes (including http headers): ")
                .append(contentBytes)
                .append(" (").append(ArchiveUtils.formatBytesForDisplay(contentBytes)).append(")\n");

        long uncompressedBytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES);
        out.append("  Total uncompressed bytes (including all warc records): ")
                .append(uncompressedBytes)
                .append(" (").append(ArchiveUtils.formatBytesForDisplay(uncompressedBytes)).append(")\n");

        out.append("  Total size on disk (")
                .append(getCompress() ? "compressed" : "uncompressed")
                .append("): ")
                .append(getTotalBytesWritten())
                .append(" (").append(ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten())).append(")\n");
        return out.toString();
    }
}
