package org.archive.modules.extractor;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.modules.CrawlURI;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorXML.class */
/**
 * Regex-based link extractor for XML-looking content.
 *
 * <p>The content is not parsed as XML. Instead, {@link #XML_URI_EXTRACTOR}
 * is run over the replayed character content and every captured token that
 * {@code UriUtils.isLikelyUri} accepts is added to the {@code CrawlURI} as a
 * speculative outlink ({@code LinkContext.SPECULATIVE_MISC} /
 * {@code Hop.SPECULATIVE}).
 */
public class ExtractorXML extends ContentExtractor {
    private static final long serialVersionUID = 3;
    private static final Logger logger = Logger.getLogger(ExtractorXML.class.getName());

    /**
     * Captures a run of characters (no whitespace, angle brackets, quotes or
     * {@code @}) that sits between an opening quote or {@code >} and a
     * closing quote or {@code <}, allowing surrounding whitespace.
     */
    static final Pattern XML_URI_EXTRACTOR = Pattern.compile("(?s)[\"'>]\\s*([^<>\\s'\"@]+)\\s*[\"'<]");

    /** Content types that mention "xml" but are excluded from extraction. */
    private static final Pattern OPENXML_CONTENT_TYPE =
            Pattern.compile("(?i)application/vnd.openxmlformats.*");

    /** Content prefix that begins with an XML declaration, optionally preceded by U+FEFF. */
    private static final Pattern XML_DECLARATION_PREFIX =
            Pattern.compile("(?is)[\\ufeff]?<\\?xml\\s.*");

    /** Content prefix that contains an HTML doctype or an opening {@code <html} tag. */
    private static final Pattern HTML_MARKER =
            Pattern.compile("(?is).*(?:<!doctype\\s+html|<html[>\\s]).*");

    /** Size passed to {@code getContentReplayPrefixString} when sniffing for XML. */
    private static final int SNIFF_PREFIX_SIZE = 400;

    /** Size passed to {@code getContentReplayPrefixString} when looking for an encoding declaration. */
    private static final int CHARSET_PREFIX_SIZE = 50;

    /**
     * Decides whether this extractor should run on the given URI.
     *
     * <p>Returns {@code true} when any of the following holds:
     * <ul>
     *   <li>the content type contains "xml" (case-insensitively) and does not
     *       match {@code application/vnd.openxmlformats*};</li>
     *   <li>the URI's string form ends with ".rss" or ".xml"
     *       (case-insensitively);</li>
     *   <li>the content prefix starts with an XML declaration and contains no
     *       HTML doctype / {@code <html} tag.</li>
     * </ul>
     *
     * @param crawlURI the URI under consideration
     * @return whether {@link #innerExtract(CrawlURI)} should be applied
     */
    @Override
    protected boolean shouldExtract(CrawlURI crawlURI) {
        String contentType = crawlURI.getContentType();
        if (contentType != null
                && contentType.toLowerCase().contains("xml")
                && !OPENXML_CONTENT_TYPE.matcher(contentType).matches()) {
            return true;
        }

        String lowerCasedUri = crawlURI.toString().toLowerCase();
        if (lowerCasedUri.endsWith(".rss") || lowerCasedUri.endsWith(".xml")) {
            return true;
        }

        // Fall back to sniffing the beginning of the recorded content.
        String prefix = crawlURI.getRecorder().getContentReplayPrefixString(SNIFF_PREFIX_SIZE);
        return XML_DECLARATION_PREFIX.matcher(prefix).matches()
                && !HTML_MARKER.matcher(prefix).matches();
    }

    /**
     * Extracts speculative links from the recorded content.
     *
     * <p>When {@code crawlURI.containsContentTypeCharsetDeclaration()} is
     * false, the encoding named in the XML declaration (if any) is compared
     * with the recorder's current charset. If they differ, the prefix is read
     * again using the declared charset; only if that second read yields the
     * same declaration is the recorder switched to it (annotation
     * {@code usingCharsetInXML:}). Otherwise the URI is annotated with
     * {@code inconsistentCharsetInXML:} and the recorder charset is left
     * alone.
     *
     * @param crawlURI the URI whose recorded content is scanned
     * @return {@code true} once the content has been scanned; {@code false}
     *         if an {@link IOException} was raised while doing so
     */
    @Override
    protected boolean innerExtract(CrawlURI crawlURI) {
        try {
            if (!crawlURI.containsContentTypeCharsetDeclaration()) {
                String rawPrefix =
                        crawlURI.getRecorder().getContentReplayPrefixString(CHARSET_PREFIX_SIZE);
                Charset declared = getContentDeclaredCharset(crawlURI, rawPrefix);
                // Null guard first: nothing to reconcile when no usable encoding was declared.
                if (declared != null && !crawlURI.getRecorder().getCharset().equals(declared)) {
                    String redecodedPrefix = crawlURI.getRecorder()
                            .getContentReplayPrefixString(CHARSET_PREFIX_SIZE, declared);
                    Charset confirmed = getContentDeclaredCharset(crawlURI, redecodedPrefix);
                    if (declared.equals(confirmed)) {
                        crawlURI.getAnnotations().add("usingCharsetInXML:" + declared);
                        crawlURI.getRecorder().setCharset(declared);
                    } else {
                        crawlURI.getAnnotations().add("inconsistentCharsetInXML:" + declared);
                    }
                }
            }
            long found = processXml(this, crawlURI, crawlURI.getRecorder().getContentReplayCharSequence());
            this.numberOfLinksExtracted.addAndGet(found);
            return true;
        } catch (IOException e) {
            // Pass the exception itself so the stack trace is not lost.
            logger.log(Level.SEVERE, "Failed getting ReplayCharSequence: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * Looks for an {@code encoding="..."} attribute inside an XML declaration
     * in the given text and resolves it to a {@link Charset}.
     *
     * <p>If the declared name is illegal or unsupported
     * ({@link Charset#forName(String)} throws an
     * {@link IllegalArgumentException}), the problem is logged at INFO, the
     * URI is annotated with {@code unsatisfiableCharsetInXML:<name>}, and
     * {@code null} is returned.
     *
     * @param crawlURI URI to annotate when the declared charset cannot be used
     * @param str      text to search, typically the start of the content
     * @return the declared charset, or {@code null} if none is declared or it
     *         cannot be resolved
     */
    protected Charset getContentDeclaredCharset(CrawlURI crawlURI, String str) {
        Matcher matcher = TextUtils.getMatcher("(?s)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]", str);
        String charsetName = null;
        try {
            if (!matcher.find()) {
                return null;
            }
            charsetName = matcher.group(1);
            return Charset.forName(charsetName);
        } catch (IllegalArgumentException e) {
            logger.log(Level.INFO, "Unknown content-encoding '" + charsetName + "' declared; using default");
            crawlURI.getAnnotations().add("unsatisfiableCharsetInXML:" + charsetName);
            return null;
        } finally {
            // Matchers obtained from TextUtils are handed back after use.
            TextUtils.recycleMatcher(matcher);
        }
    }

    /**
     * Scans the character content with {@link #XML_URI_EXTRACTOR} and adds
     * every XML-unescaped capture that {@code UriUtils.isLikelyUri} accepts
     * as a speculative outlink of {@code crawlURI}.
     *
     * <p>A candidate is counted as soon as it passes the likelihood check,
     * so the returned count also includes candidates whose addition failed
     * with a {@link URIException} (those are reported through
     * {@code extractor.logUriError}).
     *
     * @param extractor    supplies the max-outlinks parameter and receives URI errors
     * @param crawlURI     URI the discovered links are added to
     * @param charSequence content to scan
     * @return number of likely-URI candidates found
     */
    public static long processXml(Extractor extractor, CrawlURI crawlURI, CharSequence charSequence) {
        long foundLinks = 0;
        Matcher matcher = XML_URI_EXTRACTOR.matcher(charSequence);
        while (matcher.find()) {
            String candidate = StringEscapeUtils.unescapeXml(matcher.group(1));
            if (!UriUtils.isLikelyUri(candidate)) {
                continue;
            }
            foundLinks++;
            try {
                int maxOutlinks = extractor.getExtractorParameters().getMaxOutlinks();
                Link.addRelativeToBase(crawlURI, maxOutlinks, candidate,
                        LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
            } catch (URIException e) {
                extractor.logUriError(e, crawlURI.getUURI(), candidate);
            }
        }
        return foundLinks;
    }
}
