package org.archive.modules.extractor;

import java.io.InputStream;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.net.UURIFactory;
import org.archive.util.ms.Doc;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorDOC.class */
/**
 * Content extractor for responses whose content type starts with
 * {@code application/msword}.
 *
 * <p>The recorded response body is converted to text via {@code Doc.getText}
 * and scanned with {@link #PATTERN}; every quoted target that follows a
 * {@code HYPERLINK} marker is added to the URI's outlinks.
 */
public class ExtractorDOC extends ContentExtractor {
    private static final long serialVersionUID = 3L;

    /**
     * Matches {@code HYPERLINK}, any (reluctantly matched) filler, and then a
     * double-quoted string. Group 1 is the text between the quotes.
     */
    private static final Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    private static final Logger logger =
            Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");

    /**
     * Decides whether this extractor applies to the given URI.
     *
     * @param crawlURI the URI whose fetched content is being considered
     * @return {@code true} if the content type is non-null and starts with
     *         {@code application/msword}, compared case-insensitively
     */
    @Override
    protected boolean shouldExtract(CrawlURI crawlURI) {
        String contentType = crawlURI.getContentType();
        if (contentType == null) {
            return false;
        }
        // Lower-case with Locale.ROOT: under a Turkish/Azeri default locale
        // 'I' lower-cases to a dotless 'ı', so "APPLICATION/MSWORD" would
        // not match the prefix below.
        return contentType.toLowerCase(Locale.ROOT).startsWith("application/msword");
    }

    /**
     * Scans the recorded content for hyperlinks and adds each one as an outlink.
     *
     * <p>Any {@link Exception} raised while reading or scanning is added to the
     * URI's non-fatal failures instead of being propagated.
     *
     * @param crawlURI the URI whose recorded content is scanned
     * @return {@code true} if the scan ran to completion; {@code false} if no
     *         content stream was available or an exception occurred
     */
    @Override
    protected boolean innerExtract(CrawlURI crawlURI) {
        InputStream contentStream = null;
        ReplayInputStream documentStream = null;
        try {
            contentStream = crawlURI.getRecorder().getContentReplayInputStream();
            if (contentStream == null) {
                return false;
            }
            documentStream = new ReplayInputStream(contentStream);
            SeekReader docReader = Doc.getText(documentStream);

            int links = 0;
            Matcher matcher = PATTERN.matcher(new SeekReaderCharSequence(docReader, 0));
            while (matcher.find()) {
                links++;
                addLink(crawlURI, matcher.group(1));
            }
            logger.fine(crawlURI + " has " + links + " links.");
            return true;
        } catch (Exception e) {
            crawlURI.getNonFatalFailures().add(e);
            return false;
        } finally {
            // Release both streams on every exit path. Previously destroy()
            // ran only after a successful scan, so a failure in Doc.getText
            // or during matching left the replay stream undestroyed.
            IOUtils.closeQuietly(contentStream);
            if (documentStream != null) {
                documentStream.destroy();
            }
        }
    }

    /**
     * Resolves {@code str} against the URI being processed and records the
     * result as a {@code NAVLINK_MISC} outlink with a {@code NAVLINK} hop.
     *
     * <p>A {@link URIException} during resolution is reported through
     * {@code logUriError} and no outlink is added.
     *
     * @param crawlURI the URI whose outlink collection receives the new link
     * @param str      the raw link text captured from the document
     */
    private void addLink(CrawlURI crawlURI, String str) {
        try {
            crawlURI.getOutLinks().add(new Link(crawlURI.getUURI(), UURIFactory.getInstance(crawlURI.getUURI(), str), LinkContext.NAVLINK_MISC, Hop.NAVLINK));
        } catch (URIException e) {
            logUriError(e, crawlURI.getUURI(), str);
        }
        // NOTE(review): the counter is incremented even when the URI could not
        // be built above - confirm whether failed links are meant to be counted.
        this.numberOfLinksExtracted.incrementAndGet();
    }
}
