package eu.openaire.publications_retriever.crawler;

import ch.qos.logback.core.joran.spi.ConfigurationWatchList;
import crawlercommons.sitemaps.extension.LinkAttributes;
import eu.openaire.publications_retriever.exceptions.ConnTimeoutException;
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkInvalidException;
import eu.openaire.publications_retriever.exceptions.DocLinkUnavailableException;
import eu.openaire.publications_retriever.exceptions.DomainBlockedException;
import eu.openaire.publications_retriever.exceptions.DomainWithUnsupportedHEADmethodException;
import eu.openaire.publications_retriever.exceptions.DynamicInternalLinksFoundException;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.args.ArgsUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlTypeChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.io.BufferedReader;
import java.net.HttpURLConnection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:eu/openaire/publications_retriever/crawler/PageCrawler.class */
public class PageCrawler {
    // Limit on pages of one domain that yield no internal links; retrieveInternalLinks() passes the same value (as the literal 200) to the domain-blocking counter.
    public static final int timesToGiveNoInternalLinksBeforeBlocked = 200;
    // Limit on pages of one domain that yield no doc/dataset url; handlePageWithNoDocOrDatasetUrls() passes the same value (as the literal 100) to the domain-blocking counter.
    public static final int timesToGiveNoDocUrlsBeforeBlocked = 100;
    // extractInternalLinksFromHtml() throws a RuntimeException once more than this many links (literal 500 there) were gathered from one page.
    private static final int MAX_INTERNAL_LINKS_TO_ACCEPT_PAGE = 500;
    // visit() discards the page once more than this many possible doc/dataset links (literal 5 there) had to be connected.
    private static final int MAX_POSSIBLE_DOC_OR_DATASET_LINKS_TO_CONNECT = 5;
    // checkRemainingInternalLinks() discards the page once more than this many remaining links (literal 10 there) had to be connected.
    private static final int MAX_REMAINING_INTERNAL_LINKS_TO_CONNECT = 10;
    // Regex fragment matching any run of whitespace, "%20", dashes or underscores; its text appears inlined in the larger patterns of this class.
    public static final String spaceOrDashes = "(?:\\s|%20|-|_)*";
    // Regex fragment whose text appears inlined in both PARENT_CLASS_NAME_FILTER_PATTERN and PARENT_ID_FILTER_PATTERN.
    private static final String commonPattern = "website-navigation|reference|su[m]{1,2}ar(?:io|y)(?!.*metadata.*)|author|logo|related(?:\\s|%20|-|_)*product";
    // checkRemainingInternalLinks() starts evaluating the hit-rate once it has been called at least this many times (literal 20 there).
    public static final int timesToCheckInternalLinksBeforeEvaluate = 20;
    // Threshold compared against "hits * 100 / checks" in checkRemainingInternalLinks().
    // NOTE(review): since the compared value is already multiplied by 100, this reads as 0.2%, not 20% - confirm the intended unit.
    private static final double leastPercentageOfHitsFromRemainingLinks = 0.2d;
    private static final Logger logger = LoggerFactory.getLogger((Class<?>) PageCrawler.class);
    // Links (lower-cased) fully matching this pattern are dropped by gatherInternalLink().
    private static final Pattern INTERNAL_LINKS_STARTING_FROM_FILTER = Pattern.compile("^(?:(?:mailto|tel|fax|file|data|whatsapp|visible|click|text|attr):|\\{openurl}|[/]*\\?(?:locale(?:-attribute)?|ln)=).*");
    // Captures the http-url argument of a "javascript:pdflink(...)" link; the flag value 2 equals Pattern.CASE_INSENSITIVE.
    public static final Pattern JAVASCRIPT_DOC_LINK = Pattern.compile("javascript:pdflink.*'(http.+)'[\\s]*,.*", 2);
    // Captures the value of a "pdfUrl" JSON property; not referenced by the methods visible in this class.
    public static final Pattern JAVASCRIPT_CODE_PDF_LINK = Pattern.compile(".*\"pdfUrl\":\"([^\"]+)\".*");
    // Per-domain counters handed to ConnSupportUtils.countAndBlockDomainAfterTimes() by retrieveInternalLinks() and handlePageWithNoDocOrDatasetUrls() respectively.
    public static final ConcurrentHashMap<String, Integer> timesDomainNotGivingInternalLinks = new ConcurrentHashMap<>();
    public static final ConcurrentHashMap<String, Integer> timesDomainNotGivingDocUrls = new ConcurrentHashMap<>();
    // Incremented by retrieveInternalLinks() whenever a page is discarded because of its content.
    public static AtomicInteger contentProblematicUrls = new AtomicInteger(0);
    // Read by visit(); set to false by checkRemainingInternalLinks() when the hit-rate of the remaining links is too low.
    public static boolean should_check_remaining_links = true;
    // checkTextOrTitleAlongWithLink() requires the whole lower-cased link text/title to match this pattern before treating the link as a document link.
    public static final Pattern DOCUMENT_TEXT = Pattern.compile("pdf|full(?:\\s|%20|-|_)*text|download|t[ée]l[ée]charger|descargar|texte(?:\\s|%20|-|_)*intégral");
    // Matched (with matches(), on lower-cased input) against a link's text/title in checkTextOrTitleAlongWithLink()
    // and against the own-text of a link's parent element in hasUnacceptableStructure().
    // A match means the text describes something other than a wanted document (manuals, policies, forms, guidelines, etc.).
    // NOTE(review): in this view the string literal is split across two physical lines; a Java string literal cannot
    // contain a raw line break, so confirm whether the break exists in the real file or is an extraction artifact.
    public static final Pattern NON_VALID_DOCUMENT = Pattern.compile(".*(?:[^e]manu[ae]l|(?:\\|\\|(?:\\s|%20|-|_)*)?gu[ií](?:de|a)|directive[s]?|(?<!readonly_)preview|leaflet|agreement(?!.*thesis(?:\\s|%20|-|_)*(?:19|20)[\\d]{2}.*)|accessibility|journal(?:\\s|%20|-|_)*catalog|disclose(?:\\s|%20|-|_)*file|poli(?:c(?:y|ies)(?!.*paper)|tika(?:si)?)|licen(?:se|cia)(?:\\s|%20|-|_)*(?:of|de)(?:\\s|%20|-|_)*us[eo]|(?:governance|safety)(?:\\s|%20|-|_)*statement|normativa|(?:consumer|hazard|copyright)(?:\\s|%20|-|_)*(?:information|(?:release(?:\\s|%20|-|_)*)?form)|copyright|permission|(?:editorial|review)(?:\\s|%20|-|_)*board|d[ée](?:p(?:ôt[s]?|oser|osit(?!ed))|butez)|cr[ée]er(?:\\s|%20|-|_)*(?:votre|son)|orcid|subscription|instruction|code(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*conduct|[^_]request|join[^t]|compte|[^_]account|table(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*contents|(?:front|back|end)(?:\\s|%20|-|_)*matter|information(?:\\s|%20|-|_)*for(?:\\s|%20|-|_)*authors|pdf(?:/a)?(?:\\s|%20|-|_)*conversion|catalogue|factsheet|classifieds|pdf-viewer|certificate(?:\\s|%20|-|_)*of|conflict[s]?(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*interest|(?:recommendation|order)(?:\\s|%20|-|_)*form|adverti[sz]e|mandatory(?:\\s|%20|-|_)*open(?:\\s|%20|-|_)*access|recommandations(?:\\s|%20|-|_)*pour(?:\\s|%20|-|_)*s'affilier|hal.*collections|terms|conditions|hakuohjeet|logigramme|export_liste_publi|yearbook|pubs_(?:brochure|overview)|thermal-letter|réutiliser(?:\\s|%20|-|_)*des(?:\\s|%20|-|_)*images(?:\\s|%20|-|_)*dans(?:\\s|%20|-|_)*des(?:\\s|%20|-|_)*publications|procedure|規程|運営規程|(?:peer|mini)(?:\\s|%20|-|_)*review|(?:case|annual)(?:\\s|%20|-|_)*report|review(?:\\s|%20|-|_)*article|short(?:\\s|%20|-|_)*communication|letter(?:\\s|%20|-|_)*to(?:\\s|%20|-|_)*editor|how(?:\\s|%20|-|_)*to(?:\\s|%20|-|_)*(?:create|submit|contact)|tutori[ae]l|survey-results|calendar(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*events|know(?:\\s|%20|-|_)*your(?:\\s|%20|-|_)*rights|your(?:(?:\\s|%20|-|_)*id|cv)(?:\\s|%20|-|_)*hal|présentation(
?:\\s|%20|-|_)*portail(?:\\s|%20|-|_)*hal|data-sharing-guidance|rate(?:(?:\\s|%20|-|_)*)?cards|press(?:\\s|%20|-|_)*release|liability(?:\\s|%20|-|_)*disclaimer|(?:avec|dans)(?:\\s|%20|-|_)*(?:ocd|x2)?hal|online(?:\\s|%20|-|_)*flyer|publishing(?:\\s|%20|-|_)*process|book(?:\\s|%20|-|_)*of(?:\\s|%20|-|_)*abstracts|academic(?:\\s|%20|-|_)*social(?:\\s|%20|-|_)*networks|ijcseugcjournalno|manuscript(?:(?:\\s|%20|-|_)*preparation)?(?:\\s|%20|-|_)*checklist|by(?:\\s|%20|-|_)*laws|reglamento(?:\\s|%20|-|_)*de(?:\\s|%20|-|_)*ciencia(?:\\s|%20|-|_)*abierta|^(?:licen[cs]e|help|reprints|pol[ií]ti[kc][sa](?:(?:\\s|%20|-|_)*de(?:\\s|%20|-|_)*informação)?|for(?:\\s|%20|-|_)*recruiters|charte(?:\\s|%20|-|_)*de(?:\\s|%20|-|_)*signature|weekly(?:\\s|%20|-|_)*visitors|publication(?:\\s|%20|-|_)*(?:ethics(?:\\s|%20|-|_)*and(?:\\s|%20|-|_)*malpractice|fees)|redaktion|sample(?:\\s|%20|-|_)*manuscript|open(?:\\s|%20|-|_)*access(?:(?:\\s|%20|-|_)*policy)?)$|/(?:entry|information|opinion|(?:rapportannuel|publerkl|utt_so_|atsc_|tjg_|ictrp_|oproep_voor_artikels_|[^/]*call_for_contributions_)[\\w_()-]*|accesorestringido|library_recommendation_form|research-article|loi_republique_numerique_publis|nutzungsbedingungen|autorenhinweise|mediadaten|canceledpresentations|sscc-facme_cirugia|bir_journals_reprint_form|transparencia|wfme|evolution_de_l_ergonomie|que_pouvez_vous_deposer|ethic-comittee-approval|restri(?:ngido|cted)|ofi[c]+ial|asn(?:\\s|%20|-|_)*tips|aidehelp|.*_doi|(?:b-ent|aces)_.*).pdf(?:\\?.*)?$|kilavuzu|(?:公表|登録)届出書|取扱要領|リポジトリ(?:要項|運用指針)|検索のポイント|について|閲覧方法|ープンアクセスポリシー|されたみなさまへ|(?:論文の|登録)許諾書|著作権利用許諾要件|削除依頼書|ープンアクセス方針|(?:刊行物|個人)単位登録).*");
    // hasUnacceptableStructure() rejects a link when the lower-cased class-name of any of its ancestors fully matches this pattern.
    private static final Pattern PARENT_CLASS_NAME_FILTER_PATTERN = Pattern.compile("(?:^(?:tab|product-head-bnrs)$|.*(?:website-navigation|reference|su[m]{1,2}ar(?:io|y)(?!.*metadata.*)|author|logo|related(?:\\s|%20|-|_)*product|breadcrumb|su[b]?scri(?:p[tc]i[oó]n|b(?:a|ir)se)|reco[m]{1,2}enda(?:tion|do)|metric|stats|cookie|kapak|accesos-usuario).*)");
    // hasUnacceptableStructure() rejects a link when the lower-cased id of any of its ancestors fully matches this pattern.
    private static final Pattern PARENT_ID_FILTER_PATTERN = Pattern.compile(".*(?:website-navigation|reference|su[m]{1,2}ar(?:io|y)(?!.*metadata.*)|author|logo|related(?:\\s|%20|-|_)*product|other).*");
    // Number of times checkRemainingInternalLinks() was entered.
    public static final AtomicInteger timesCheckedRemainingLinks = new AtomicInteger(0);
    // Number of times checkRemainingInternalLinks() found a doc/dataset url among the remaining links.
    public static final AtomicInteger timesFoundDocOrDatasetUrlFromRemainingLinks = new AtomicInteger(0);

    /**
     * Crawls the HTML page located at {@code str3} in search of a document or dataset url.
     * <p>
     * Steps: resolve the page's domain, read its HTML, give the special handlers (turkjgastroenterol, metadata)
     * a chance to resolve the page, then extract the internal links. Links matching the doc/dataset url filters
     * are connected right away (at most 5 per page); all other links are kept aside and handed to
     * {@link #checkRemainingInternalLinks} when {@code should_check_remaining_links} is still enabled.
     * Every exit path that does not find a url records the page through {@code UrlUtils.addOutputData()}
     * (directly or through a helper).
     * <p>
     * Fix over the previous version: an internal link which could not be turned into a fully-formed, normalized url
     * is now logged and skipped. Previously the warning was logged for every link and a failed normalization
     * continued with an unassigned/null url.
     *
     * @param str first identifier forwarded untouched to the output/connection helpers (presumably the url-id - confirm with callers)
     * @param str2 second identifier forwarded untouched to the output/connection helpers (presumably the source url - confirm with callers)
     * @param str3 the url of the page being crawled
     * @param str4 the content-type of the page, used only in the output comment
     * @param httpURLConnection the open connection to the page, used to read the HTML
     * @param str5 optional text to prepend to the retrieved HTML; may be {@code null}
     * @param bufferedReader the reader used to read the HTML; closed here only on the domain-retrieval error path
     */
    public static void visit(String str, String str2, String str3, String str4, HttpURLConnection httpURLConnection, String str5, BufferedReader bufferedReader) {
        logger.debug("Visiting pageUrl: \"" + str3 + "\".");
        String domainStr = UrlUtils.getDomainStr(str3, null);
        if (domainStr == null) {
            logger.warn("Problematic URL in \"PageCrawler.visit()\": \"" + str3 + "\"");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in PageCrawler.visit() method, after the occurrence of a domain-retrieval error.", null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            ConnSupportUtils.closeBufferedReader(bufferedReader);
            return;
        }

        String pageHtml = ConnSupportUtils.getHtmlString(httpURLConnection, bufferedReader, false);
        if (pageHtml == null) {
            logger.warn("Could not retrieve the HTML-code for pageUrl: " + str3);
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its HTML-code. Its contentType is: '" + str4 + "'.", null, true, "true", "true", "false", "false", "true", null, "null", "N/A");
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            return;
        }
        if (str5 != null) {
            pageHtml = str5 + pageHtml;
        }

        if (LoaderAndChecker.retrieveDocuments && domainStr.contains("turkjgastroenterol.org")) {
            SpecialUrlsHandler.extractAndCheckTurkjgastroenterolDocUrl(pageHtml, str, str2, str3, domainStr);
            return;
        }
        if (MetadataHandler.checkAndHandleMetadata(str, str2, str3, domainStr, pageHtml)) {
            return;
        }
        HashSet<String> internalLinks = retrieveInternalLinks(str, str2, str3, domainStr, pageHtml, str4);
        if (internalLinks == null) {
            return; // The page was already logged (or resolved) inside "retrieveInternalLinks()".
        }

        // Links which do not look like doc/dataset urls; they are checked later, only if nothing better is found.
        HashSet<String> remainingLinks = new HashSet<>(internalLinks.size());
        int possibleDocOrDatasetLinks = 0;
        for (String internalLink : internalLinks) {
            String fullyFormedUrl = ConnSupportUtils.getFullyFormedUrl(str3, internalLink, null);
            String urlToCheck = (fullyFormedUrl != null) ? LoaderAndChecker.basicURLNormalizer.filter(fullyFormedUrl) : null;
            if (urlToCheck == null) {
                logger.warn("Could not normalize internal url: " + internalLink);
                continue;
            }

            IdUrlMimeTypeTriple knownTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(urlToCheck);
            if (knownTriple != null) {
                ConnSupportUtils.handleReCrossedDocUrl(str, str2, str3, urlToCheck, knownTriple, false);
                return;
            }

            String lowerCaseUrl = urlToCheck.toLowerCase();
            boolean looksLikeTargetUrl = (LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseUrl).matches())
                    || (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseUrl).matches());
            if (!looksLikeTargetUrl) {
                remainingLinks.add(urlToCheck);
                continue;
            }
            if (UrlUtils.duplicateUrls.contains(urlToCheck)) {
                continue;
            }
            if (UrlTypeChecker.shouldNotAcceptInternalLink(urlToCheck, lowerCaseUrl)) {
                UrlUtils.duplicateUrls.add(urlToCheck);
                continue;
            }

            possibleDocOrDatasetLinks++;
            if (possibleDocOrDatasetLinks > 5) {
                logger.warn("The maximum limit (5) of possible doc or dataset links to be connected was reached for pageUrl: \"" + str3 + "\". The page was discarded.");
                handlePageWithNoDocOrDatasetUrls(str, str2, str3, domainStr, true, false);
                return;
            }

            try {
                if (HttpConnUtils.connectAndCheckMimeType(str, str2, str3, urlToCheck, null, false, true)) {
                    return; // A doc or dataset url was found and handled by the callee.
                }
                UrlUtils.duplicateUrls.add(urlToCheck);
            } catch (ConnTimeoutException e) {
                // Only give up on the page when the timed-out link belongs to the page's own domain.
                if (urlToCheck.contains(domainStr)) {
                    logger.warn("Page: \"" + str3 + "\" left \"PageCrawler.visit()\" after a potentialDocUrl caused a ConnTimeoutException.");
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as an internalLink of this page caused 'ConnTimeoutException'.", null, true, "true", "true", "false", "false", "true", null, "null", "N/A");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return;
                }
            } catch (DomainBlockedException e) {
                // Only give up on the page when the blocked domain (carried in the message) is the page's own domain.
                String blockedDomain = e.getMessage();
                if (blockedDomain != null && blockedDomain.contains(domainStr)) {
                    logger.warn("Page: \"" + str3 + "\" left \"PageCrawler.visit()\" after it's domain was blocked.");
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.visit()' method, as its domain was blocked during crawling.", null, true, "true", "true", "false", "false", LoaderAndChecker.COULD_RETRY_URLS.matcher(str3).matches() ? "true" : "false", null, "null", "N/A");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return;
                }
            } catch (RuntimeException e) {
                // The link is unusable; remember it so that it is not connected again.
                UrlUtils.duplicateUrls.add(urlToCheck);
            } catch (Exception e) {
                logger.error("Error when processing the url: " + urlToCheck, (Throwable) e);
            }
        }

        if (!should_check_remaining_links || remainingLinks.isEmpty()) {
            handlePageWithNoDocOrDatasetUrls(str, str2, str3, domainStr, false, false);
        } else {
            checkRemainingInternalLinks(str, str2, str3, domainStr, remainingLinks);
        }
    }

    /**
     * Book-keeping for a page in which no doc or dataset url was found: bumps the global counter,
     * optionally warns and writes the output record, and counts the miss against the page's domain
     * (which gets blocked by the counting helper once the limit of 100 is exceeded).
     *
     * @param str first identifier forwarded to {@code UrlUtils.addOutputData()}
     * @param str2 second identifier forwarded to {@code UrlUtils.addOutputData()}
     * @param str3 the url of the page
     * @param str4 the domain of the page
     * @param z when {@code true}, the "does not contain" warning is suppressed
     * @param z2 when {@code true}, no output record is written by this method
     */
    private static void handlePageWithNoDocOrDatasetUrls(String str, String str2, String str3, String str4, boolean z, boolean z2) {
        final boolean shouldWarn = !z;
        final boolean shouldWriteOutput = !z2;

        if (shouldWarn) {
            logger.warn("Page: \"" + str3 + "\" does not contain a " + ArgsUtils.targetUrlType + ".");
        }

        UrlTypeChecker.pagesNotProvidingDocUrls.incrementAndGet();

        if (shouldWriteOutput) {
            String comment = "Logged in 'PageCrawler.visit()' method, as no " + ArgsUtils.targetUrlType + " was found inside.";
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, comment, null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
        }

        boolean domainGotBlocked = ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, timesDomainNotGivingDocUrls, str4, 100, true);
        if (!domainGotBlocked) {
            return;
        }
        logger.warn("Domain: \"" + str4 + "\" was blocked after giving no " + ArgsUtils.targetUrlType + " more than 100 times.");
    }

    /**
     * Extracts the internal links of a page and translates every failure mode of the extraction
     * into an output record, so that the caller only has to check for {@code null}.
     *
     * @param str first identifier forwarded to the output helpers
     * @param str2 second identifier forwarded to the output helpers
     * @param str3 the url of the page
     * @param str4 the domain of the page
     * @param str5 the HTML of the page
     * @param str6 the content-type of the page, used only in log/output messages
     * @return the non-empty set of internal links, or {@code null} when the page was already handled here
     *         (no links, a doc-link was signalled through an exception, dynamic links, or any other error)
     */
    public static HashSet<String> retrieveInternalLinks(String str, String str2, String str3, String str4, String str5, String str6) {
        try {
            HashSet<String> links = extractInternalLinksFromHtml(str5, str3);
            if (links != null && !links.isEmpty()) {
                return links;
            }

            // A non-null result at this point is an empty one: links existed but none of them was valid.
            boolean noValidLinks = (links != null);
            String logQualifier = noValidLinks ? "valid" : "available";
            String outputQualifier = noValidLinks ? "valid " : "";
            logger.warn("No " + logQualifier + " links were able to be retrieved from pageUrl: \"" + str3 + "\". Its contentType is: " + str6);
            contentProblematicUrls.incrementAndGet();
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in PageCrawler.retrieveInternalLinks() method, as no " + outputQualifier + "links were able to be retrieved from it. Its contentType is: '" + str6 + "'", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            if (ConnSupportUtils.countAndBlockDomainAfterTimes(HttpConnUtils.blacklistedDomains, timesDomainNotGivingInternalLinks, str4, 200, true)) {
                logger.warn("Domain: \"" + str4 + "\" was blocked after not providing internalLinks more than 200 times.");
            }
            return null;
        } catch (DocLinkFoundException docLinkFound) {
            // The extraction signalled a direct doc-link; if it does not verify, count the page as a miss
            // (the output record was already written by "verifyDocLink()").
            boolean verified = verifyDocLink(str, str2, str3, str6, docLinkFound);
            if (!verified) {
                handlePageWithNoDocOrDatasetUrls(str, str2, str3, str4, false, true);
            }
            return null;
        } catch (DocLinkInvalidException docLinkInvalid) {
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as there was an invalid docLink. Its contentType is: '" + str6 + "'", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            handlePageWithNoDocOrDatasetUrls(str, str2, str3, str4, false, true);
            return null;
        } catch (DocLinkUnavailableException docLinkUnavailable) {
            logger.warn("The docLink was not available inside pageUrl: " + str3);
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as the doc-link was not available. Its contentType is: '" + str6 + "'", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            contentProblematicUrls.incrementAndGet();
            return null;
        } catch (DynamicInternalLinksFoundException dynamicLinksFound) {
            // Pages with templated links cannot be crawled; the whole domain is blacklisted immediately.
            HttpConnUtils.blacklistedDomains.add(str4);
            logger.warn("Page: \"" + str3 + "\" left \"PageCrawler.visit()\" after found to have dynamic links. Its domain \"" + str4 + "\"  was blocked.");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.retrieveInternalLinks()', as it belongs to a domain with dynamic-links.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            contentProblematicUrls.incrementAndGet();
            return null;
        } catch (RuntimeException runtimeProblem) {
            String reason = runtimeProblem.getMessage();
            if (reason == null) {
                reason = "No reason was given!";
            }
            logger.warn(reason + " This page was discarded.");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.retrieveInternalLinks()' method, with reason: " + reason, null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            contentProblematicUrls.incrementAndGet();
            return null;
        } catch (Exception otherProblem) {
            logger.warn("Could not retrieve the internalLinks for pageUrl: " + str3);
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.retrieveInternalLinks()' method, as there was a problem retrieving its internalLinks. Its contentType is: '" + str6 + "'", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            contentProblematicUrls.incrementAndGet();
            return null;
        }
    }

    /**
     * Parses the given HTML and gathers the links of its {@code a}, {@code link[href][type*=pdf]} and
     * {@code form[action]} elements.
     * <p>
     * While iterating, a link that is recognised as a direct doc-link (by its text, its title, its {@code type}
     * attribute or a doc-like form {@code action}) is not returned; it is signalled to the caller by throwing
     * {@link DocLinkFoundException} (or {@link DocLinkInvalidException} when the link is unusable).
     * <p>
     * Fix over the previous version: an element which has no usable {@code href}, no {@code data*} attribute
     * and no doc-like {@code action} is now skipped. Previously {@code null} was passed to
     * {@link #gatherInternalLink(String)}, which threw a {@link NullPointerException} and made the caller
     * discard the whole page.
     * <p>
     * NOTE(review): when {@code checkTextOrTitleAlongWithLink()} returns {@code true} the element's link is
     * still gathered below; confirm whether such elements were meant to be skipped entirely.
     *
     * @param str the HTML of the page
     * @param str2 the url of the page, used to resolve relative links and for site-specific handling
     * @return the gathered links (possibly empty), or {@code null} when the HTML has no candidate element at all
     * @throws DocLinkFoundException when a direct doc-link was identified; its message carries the link
     * @throws DocLinkInvalidException when a doc-link was identified but it is empty, "#" or not acceptable
     * @throws DynamicInternalLinksFoundException propagated from {@link #gatherInternalLink(String)}
     * @throws DocLinkUnavailableException declared for callers; not thrown directly by the code visible here
     * @throws RuntimeException when more than 500 links were gathered from the page
     */
    public static HashSet<String> extractInternalLinksFromHtml(String str, String str2) throws DocLinkFoundException, DynamicInternalLinksFoundException, DocLinkInvalidException, DocLinkUnavailableException, RuntimeException {
        Elements linkElements = Jsoup.parse(str).select("a, link[href][type*=pdf], form[action]");
        if (linkElements.isEmpty()) {
            return null;
        }
        HashSet<String> internalLinks = new HashSet<>(linkElements.size() / 2);
        int gatheredLinks = 0;
        if (str2.contains("aup-online.com")) {
            SpecialUrlsHandler.handleAupOnlinePage(str2, linkElements);
        }

        for (Element element : linkElements) {
            if (hasUnacceptableStructure(element, str2)) {
                continue;
            }

            if (LoaderAndChecker.retrieveDocuments) {
                // Try to identify a doc-link by the element's text, then its title, then its mime "type" attribute.
                String text = element.text().trim();
                boolean handledByText = !text.isEmpty() && checkTextOrTitleAlongWithLink(element, text);
                if (!handledByText) {
                    String title = element.attr("title").trim();
                    boolean handledByTitle = !title.isEmpty() && checkTextOrTitleAlongWithLink(element, title);
                    if (!handledByTitle) {
                        String type = element.attr("type").trim();
                        if (!type.isEmpty() && ConnSupportUtils.knownDocMimeTypes.contains(type)) {
                            String typedHref = element.attr(LinkAttributes.HREF).trim();
                            if (typedHref.isEmpty() || typedHref.equals("#")) {
                                throw new DocLinkInvalidException(typedHref);
                            }
                            String fullyFormedUrl = ConnSupportUtils.getFullyFormedUrl(str2, typedHref, null);
                            if (fullyFormedUrl != null && UrlTypeChecker.shouldNotAcceptInternalLink(fullyFormedUrl, null)) {
                                throw new DocLinkInvalidException(typedHref);
                            }
                            throw new DocLinkFoundException(StringUtils.replace(typedHref, "/view/", "/download/", 1));
                        }
                    }
                }
            }

            String link = element.attr(LinkAttributes.HREF).trim();
            if (link.isEmpty() || link.equals("#")) {
                // No usable "href": fall back to a "data*" attribute and then to a form "action".
                link = getInternalDataLink(element);
                if (link == null) {
                    String action = element.attr("action").trim();
                    if (!action.isEmpty() && !action.equals("#") && LoaderAndChecker.DOC_URL_FILTER.matcher(action.toLowerCase()).matches()) {
                        String fullyFormedAction = ConnSupportUtils.getFullyFormedUrl(str2, action, null);
                        if (fullyFormedAction == null || !UrlTypeChecker.shouldNotAcceptInternalLink(fullyFormedAction, null)) {
                            throw new DocLinkFoundException(action);
                        }
                        throw new DocLinkInvalidException(action);
                    }
                    continue; // Nothing to gather from this element.
                }
            }

            String gatheredLink = gatherInternalLink(link);
            if (gatheredLink == null) {
                continue;
            }
            internalLinks.add(gatheredLink);
            gatheredLinks++;
            if (gatheredLinks > 500) {
                throw new RuntimeException("Avoid checking more than 500 internal links which were found in pageUrl \"" + str2 + "\".");
            }
        }
        return internalLinks;
    }

    /**
     * Decides, from a link's text or title, whether the link is a direct doc-link.
     *
     * @param element the link element whose {@code href} (or {@code data*} attribute) is inspected
     * @param str the text or the title of the element
     * @return {@code true} when the text marks a non-valid document, or when it is a document text but the
     *         element offers no usable link (none at all, or a "javascript:" one); {@code false} when the text
     *         is not a document text
     * @throws DocLinkFoundException when the text is a document text and the link is acceptable; the message
     *         carries the link with its first "/view/" replaced by "/download/"
     * @throws DocLinkInvalidException when the text is a document text but the link is not acceptable
     */
    private static boolean checkTextOrTitleAlongWithLink(Element element, String str) throws DocLinkFoundException, DocLinkInvalidException {
        final String lowerCaseText = str.toLowerCase();

        if (NON_VALID_DOCUMENT.matcher(lowerCaseText).matches()) {
            return true;
        }
        boolean isDocumentText = DOCUMENT_TEXT.matcher(lowerCaseText).matches();
        if (!isDocumentText) {
            return false;
        }

        String link = element.attr(LinkAttributes.HREF).trim();
        boolean hrefUnusable = link.isEmpty() || link.equals("#");
        if (hrefUnusable) {
            link = getInternalDataLink(element);
        }
        if (link == null || link.startsWith("javascript:")) {
            return true;
        }

        if (!UrlTypeChecker.shouldNotAcceptInternalLink(link, null)) {
            throw new DocLinkFoundException(StringUtils.replace(link, "/view/", "/download/", 1));
        }
        throw new DocLinkInvalidException(link);
    }

    /**
     * Looks for a link inside the attributes of the element whose key contains "data"
     * (keys containing "data-follow-set" are ignored).
     *
     * @param element the element to inspect
     * @return the trimmed value of the first such attribute which is neither empty nor "#"; when all of them
     *         are empty or "#", the trimmed value of the last one; {@code null} when no such attribute exists
     */
    private static String getInternalDataLink(Element element) {
        String candidate = null;
        for (Attribute attr : element.attributes().asList()) {
            String attrName = attr.getKey();
            boolean isDataAttribute = attrName.contains("data") && !attrName.contains("data-follow-set");
            if (!isDataAttribute) {
                continue;
            }
            candidate = attr.getValue().trim();
            if (candidate.isEmpty() || candidate.equals("#")) {
                continue; // Keep looking for a better value, but remember this one.
            }
            return candidate;
        }
        return candidate;
    }

    /**
     * Tells whether a link element should be ignored because of where it sits in the page.
     * <p>
     * A link is rejected when: it has the class "state-published" and points to a domain not contained in the
     * page url; its direct parent's own text matches {@code NON_VALID_DOCUMENT}; or any ancestor is a
     * {@code footer}/{@code header} tag or has a class-name/id matching the parent filter patterns.
     *
     * @param element the link element
     * @param str the url of the page
     * @return {@code true} when the element must be skipped
     */
    private static boolean hasUnacceptableStructure(Element element, String str) {
        if (element.className().trim().equals("state-published")) {
            String href = element.attr(LinkAttributes.HREF).trim();
            if (href.startsWith(ConfigurationWatchList.HTTP_PROTOCOL_STR, 0)) {
                String linkDomain = UrlUtils.getDomainStr(href, null);
                if (linkDomain != null && !str.contains(linkDomain)) {
                    return true;
                }
            }
        }

        Element directParent = element.parent();
        if (directParent == null) {
            return false;
        }
        String parentText = directParent.ownText().trim().toLowerCase();
        if (!parentText.isEmpty() && NON_VALID_DOCUMENT.matcher(parentText).matches()) {
            return true;
        }

        // Walk up through all the ancestors, starting from the direct parent.
        for (Element ancestor = directParent; ancestor != null; ancestor = ancestor.parent()) {
            String tag = ancestor.tagName().trim();
            if (tag.equals("footer") || tag.equals("header")) {
                return true;
            }
            String classNames = ancestor.className().trim();
            if (!classNames.isEmpty() && PARENT_CLASS_NAME_FILTER_PATTERN.matcher(classNames.toLowerCase()).matches()) {
                return true;
            }
            String ancestorId = ancestor.id();
            if (!ancestorId.isEmpty() && PARENT_ID_FILTER_PATTERN.matcher(ancestorId.toLowerCase()).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Filters a raw link taken from the page and decides whether it is worth keeping.
     *
     * @param str the raw link (must not be {@code null})
     * @return the link to keep (with its anchor removed when it is an anchored doc/dataset url),
     *         or {@code null} when the link must be ignored
     * @throws DynamicInternalLinksFoundException when the link contains template markers ("{{" or "&lt;?")
     * @throws DocLinkFoundException when the link is a "javascript:pdflink" call; the message carries the
     *         captured url (or {@code null} if the capture failed)
     */
    public static String gatherInternalLink(String str) throws DynamicInternalLinksFoundException, DocLinkFoundException {
        if (str.equals("/")) {
            return null;
        }
        boolean hasTemplateMarkers = str.contains("{{") || str.contains("<?");
        if (hasTemplateMarkers) {
            throw new DynamicInternalLinksFoundException();
        }

        final String lowerCaseLink = str.toLowerCase();
        if (INTERNAL_LINKS_STARTING_FROM_FILTER.matcher(lowerCaseLink).matches()) {
            return null;
        }

        if (lowerCaseLink.contains("#")) {
            boolean isDocUrl = LoaderAndChecker.retrieveDocuments && LoaderAndChecker.DOC_URL_FILTER.matcher(lowerCaseLink).matches();
            if (isDocUrl || (LoaderAndChecker.retrieveDatasets && LoaderAndChecker.DATASET_URL_FILTER.matcher(lowerCaseLink).matches())) {
                return UrlUtils.removeAnchor(str);
            }
            // Plain anchors are dropped; only links using "/#/" as part of their path are kept.
            if (!lowerCaseLink.contains("/#/")) {
                return null;
            }
        } else if (lowerCaseLink.contains("\"") || lowerCaseLink.contains("[error")) {
            return null;
        }

        boolean isJavascriptLink = lowerCaseLink.startsWith("javascript:");
        if (!isJavascriptLink) {
            return str;
        }

        Matcher pdfLinkMatcher = JAVASCRIPT_DOC_LINK.matcher(str);
        if (!pdfLinkMatcher.matches()) {
            return null;
        }
        String javascriptDocLink = null;
        try {
            javascriptDocLink = pdfLinkMatcher.group(1);
        } catch (Exception groupFailure) {
            logger.error("", (Throwable) groupFailure);
        }
        throw new DocLinkFoundException(javascriptDocLink);
    }

    /**
     * Verifies the doc-link carried in the message of a {@link DocLinkFoundException}: the link is turned into a
     * fully-formed, normalized url and then either matched against the already known doc/dataset urls or
     * connected to check its mime-type. Every failure writes its own output record.
     *
     * @param str first identifier forwarded to the output/connection helpers
     * @param str2 second identifier forwarded to the output/connection helpers
     * @param str3 the url of the page in which the link was found
     * @param str4 the content-type of the page, used only in the output comment
     * @param docLinkFoundException the exception whose message is the doc-link
     * @return {@code true} when the link was a known or verified doc/dataset url, otherwise {@code false}
     */
    public static boolean verifyDocLink(String str, String str2, String str3, String str4, DocLinkFoundException docLinkFoundException) {
        final String docLink = docLinkFoundException.getMessage();
        if (docLink == null || docLink.isEmpty()) {
            logger.warn("DocLink was not retrieved!");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving its internalLinks. Its contentType is: '" + str4 + "'", null, true, "true", "true", "false", "false", "true", null, "null", "N/A");
            return false;
        }

        String fullyFormedUrl = ConnSupportUtils.getFullyFormedUrl(str3, docLink, null);
        String normalizedUrl = null;
        if (fullyFormedUrl != null) {
            normalizedUrl = LoaderAndChecker.basicURLNormalizer.filter(fullyFormedUrl);
        }
        if (normalizedUrl == null) {
            logger.warn("Could not normalize internal url: " + docLink);
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as there were normalization problems with the 'possibleDocUrl' found inside: " + docLink, null, true, "true", "false", "false", "false", "false", null, "null", "N/A");
            return false;
        }

        IdUrlMimeTypeTriple knownTriple = UrlUtils.docOrDatasetUrlsWithIDs.get(normalizedUrl);
        if (knownTriple != null) {
            ConnSupportUtils.handleReCrossedDocUrl(str, str2, str3, normalizedUrl, knownTriple, false);
            return true;
        }

        boolean isTargetUrl;
        try {
            isTargetUrl = HttpConnUtils.connectAndCheckMimeType(str, str2, str3, normalizedUrl, null, false, true);
        } catch (Exception connectionProblem) {
            logger.warn("The DocLink < " + normalizedUrl + " > was not reached!");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + normalizedUrl + " > had connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            return false;
        }
        if (isTargetUrl) {
            return true;
        }
        try {
            logger.warn("The DocLink < " + normalizedUrl + " > was not a " + ArgsUtils.targetUrlType + " (unexpected)!");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + normalizedUrl + " > was not a docUrl.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            return false;
        } catch (Exception reportingProblem) {
            // Kept for parity with the original, where the reporting above ran inside the same try-block as the connection.
            logger.warn("The DocLink < " + normalizedUrl + " > was not reached!");
            UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded in 'PageCrawler.visit()' method, as the retrieved DocLink: < " + normalizedUrl + " > had connectivity problems.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
            return false;
        }
    }

    /**
     * Connects to the internal links which did not look like doc/dataset urls, hoping that one of them is one.
     * <p>
     * Once this method has been entered at least 20 times, the global hit-rate is evaluated first; when it is below
     * {@code leastPercentageOfHitsFromRemainingLinks}, {@code should_check_remaining_links} is switched off
     * and the page is handled as having no doc/dataset url. Only links containing the page's domain which are
     * not already marked as duplicates are considered, and at most 10 of them are connected.
     *
     * @param str first identifier forwarded to the output/connection helpers
     * @param str2 second identifier forwarded to the output/connection helpers
     * @param str3 the url of the page
     * @param str4 the domain of the page
     * @param hashSet the remaining internal links of the page
     * @return {@code true} when a doc/dataset url was found among the links, otherwise {@code false}
     */
    public static boolean checkRemainingInternalLinks(String str, String str2, String str3, String str4, HashSet<String> hashSet) {
        final int timesChecked = timesCheckedRemainingLinks.incrementAndGet();
        if (timesChecked >= 20) {
            double hitPercentage = (timesFoundDocOrDatasetUrlFromRemainingLinks.get() * 100.0d) / timesChecked;
            if (hitPercentage < leastPercentageOfHitsFromRemainingLinks) {
                logger.warn("The percentage of found docUrls from the remaining links is too low ( " + hitPercentage + "% ). Stop checking the remaining-internalLinks for any pageUrl..");
                should_check_remaining_links = false;
                handlePageWithNoDocOrDatasetUrls(str, str2, str3, str4, false, false);
                return false;
            }
        }

        int connectedLinks = 0;
        for (String remainingLink : hashSet) {
            if (!remainingLink.contains(str4) || UrlUtils.duplicateUrls.contains(remainingLink)) {
                continue;
            }
            if (UrlTypeChecker.shouldNotAcceptInternalLink(remainingLink, null)) {
                UrlUtils.duplicateUrls.add(remainingLink);
                continue;
            }

            connectedLinks++;
            if (connectedLinks > 10) {
                logger.warn("The maximum limit (10) of remaining links to be connected was reached for pageUrl: \"" + str3 + "\". The page was discarded.");
                handlePageWithNoDocOrDatasetUrls(str, str2, str3, str4, true, false);
                return false;
            }

            try {
                boolean found = HttpConnUtils.connectAndCheckMimeType(str, str2, str3, remainingLink, null, false, false);
                if (found) {
                    timesFoundDocOrDatasetUrlFromRemainingLinks.incrementAndGet();
                    return true;
                }
                UrlUtils.duplicateUrls.add(remainingLink);
            } catch (ConnTimeoutException timeout) {
                if (remainingLink.contains(str4)) {
                    logger.warn("Page: \"" + str3 + "\" left \"PageCrawler.checkRemainingInternalLinks()\" after an internalLink caused a ConnTimeoutException.");
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.checkRemainingInternalLinks()' method, as an internalLink of this page caused 'ConnTimeoutException'.", null, true, "true", "true", "false", "false", "true", null, "null", "N/A");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return false;
                }
            } catch (DomainBlockedException domainBlocked) {
                // Only give up on the page when the blocked domain (carried in the message) is the page's own domain.
                String blockedDomain = domainBlocked.getMessage();
                if (blockedDomain != null && blockedDomain.contains(str4)) {
                    logger.warn("Page: \"" + str3 + "\" left \"PageCrawler.checkRemainingInternalLinks()\" after it's domain was blocked.");
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.checkRemainingInternalLinks()' method, as its domain was blocked during crawling.", null, true, "true", "true", "false", "false", LoaderAndChecker.COULD_RETRY_URLS.matcher(str3).matches() ? "true" : "false", null, "null", "N/A");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return false;
                }
            } catch (DomainWithUnsupportedHEADmethodException headUnsupported) {
                if (remainingLink.contains(str4)) {
                    logger.warn("Page: \"" + str3 + "\" left \"PageCrawler.checkRemainingInternalLinks()\" after it's domain was caught to not support the HTTP HEAD method, as a result, the internal-links will stop being checked.");
                    UrlUtils.addOutputData(str, str2, str3, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Logged in 'PageCrawler.checkRemainingInternalLinks()' method, as its domain was caught to not support the HTTP HEAD method.", null, true, "true", "true", "false", "false", "false", null, "null", "N/A");
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return false;
                }
            } catch (RuntimeException ignored) {
                // Nothing is done here: the loop simply moves on to the next link.
            }
        }

        handlePageWithNoDocOrDatasetUrls(str, str2, str3, str4, false, false);
        return false;
    }

    /**
     * Writes each of the given links to the logger at debug level, one log entry per link.
     *
     * @param hashSet the links to print (must not be {@code null})
     */
    public static void printInternalLinksForDebugging(HashSet<String> hashSet) {
        for (String internalLink : hashSet) {
            logger.debug(internalLink);
        }
    }
}
