/*
 * Decompiled with CFR 0.152.
 */
package eu.openaire.publications_retriever.crawler;

import eu.openaire.publications_retriever.crawler.PageCrawler;
import eu.openaire.publications_retriever.exceptions.DocLinkFoundException;
import eu.openaire.publications_retriever.exceptions.DocLinkUnavailableException;
import eu.openaire.publications_retriever.machine_learning.PageStructureMLA;
import eu.openaire.publications_retriever.models.IdUrlMimeTypeTriple;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.Strings;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helpers that recognise urls of a handful of specific domains (europepmc, nasa,
 * frontiersin, wiley, ...) and rewrite them into another url form, typically one that
 * points to the document file itself rather than to its landing page.
 *
 * <p>Convention shared by almost all {@code checkAndXxx(String)} methods:
 * <ul>
 *   <li>{@code null} is returned when the given url does not belong to the handled domain,</li>
 *   <li>the given url is returned as-is when it belongs to the domain but cannot (or need not)
 *       be transformed,</li>
 *   <li>otherwise the transformed url is returned.</li>
 * </ul>
 * None of the methods accept a {@code null} url (except {@link #checkAndHandleDergipark(String)},
 * which simply delegates to a null-tolerant replace).
 */
public class SpecialUrlsHandler {
    private static final Logger logger = LoggerFactory.getLogger(SpecialUrlsHandler.class);

    private static final String europepmcPageUrlBasePath = "https://europepmc.org/backend/ptpmcrender.fcgi?accid=";
    private static final String nasaBaseDomainPath = "https://ntrs.nasa.gov/";
    private static final String nasaDomainWithSlash = "ntrs.nasa.gov/";
    private static final String ieeexploreBasePath = "https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=";

    /**
     * Captures (group 1) a relative "/content/files/....pdf" path that is the sole content of a {@code <div>}.
     * Kept public and non-final for backward compatibility with existing users of this field.
     */
    public static Pattern Turkjgastroenterol_docUrl_pattern = Pattern.compile("<div[\\s]*>[\\s]*(/content/files/[^<>]+\\.pdf)[\\s]*</div>");

    private static final String ijcseonlineBaseUrl = "https://www.ijcseonline.org/pub_paper/";

    /** Captures (group 1) everything after the last {@code '&'} of the last path-segment. */
    private static final Pattern IJCSEONLINE_PDF_FILENAME = Pattern.compile(".+/[^/]+&(.+)$");

    /** Matches "onlinelibrary.wiley.com" doi-urls; group 1 is an optional path-segment (incl. trailing '/') right before "doi/". */
    private static final Pattern ONLINELIBRARY_WILEY = Pattern.compile("https?://[^/]*onlinelibrary\\.wiley\\.com/([^/]+/)?doi/.*");

    /** Matches doi.org urls whose path is itself an http(s) link; group 1 is that inner link. */
    private static final Pattern DOI_URL_WITH_INNER_LINK = Pattern.compile("https?://(?:dx\\.)?doi\\.org/(http.*)");

    /** A ":/" which is NOT followed by a second slash, i.e. a scheme-separator that lost one of its slashes. */
    private static final Pattern COLON_WITH_SINGLE_SLASH = Pattern.compile(":/(?!/)");

    private static final String TURKJGASTROENTEROL_EXTRACTION_ERROR = "Discarded in 'PageCrawler.visit()' method, as there was a problem retrieving the \"turkjgastroenterol\"-pdf-url from its html.";

    /**
     * Runs the given url through the domain-specific handlers, in a fixed order, and returns the
     * result of the first handler that recognises it (i.e. returns non-null).
     * When no handler recognises the url, the result of {@link #checkAndHandleDergipark(String)} is returned.
     *
     * @param resourceUrl the url to check; must not be null
     * @return the url to use from now on (possibly the same as the given one)
     * @throws RuntimeException propagated from {@link #checkAndGetFrontiersinDocUrl(String)}
     */
    public static String checkAndHandleSpecialUrls(String resourceUrl) throws RuntimeException {
        String updatedUrl;
        if ((updatedUrl = checkAndGetEuropepmcDocUrl(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndDowngradeManuscriptElsevierUrl(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndGetNasaDocUrl(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndGetFrontiersinDocUrl(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandlePsyarxiv(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandleIjcseonlinePage(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandleIeeeExplorer(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandleOSFurls(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandleWileyUrls(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandleScieloUrls(resourceUrl)) != null) {
            return updatedUrl;
        }
        if ((updatedUrl = checkAndHandleDoiUrlsWithInnerLinks(resourceUrl)) != null) {
            return updatedUrl;
        }
        // The dergipark-handler never signals "not mine" with null; it just returns the (possibly rewritten) url.
        return checkAndHandleDergipark(resourceUrl);
    }

    /**
     * Rewrites a "europepmc.org" url (which is not already a "ptpmcrender.fcgi" one) to the
     * "ptpmcrender.fcgi?accid=PMC...&blobtype=pdf" form, using the id given by {@code UrlUtils.getDocIdStr()}.
     *
     * @param europepmcUrl the url to check; must not be null
     * @return the rewritten url, the given url if no id could be extracted, or null for non-handled urls
     */
    public static String checkAndGetEuropepmcDocUrl(String europepmcUrl) {
        if (!europepmcUrl.contains("europepmc.org") || europepmcUrl.contains("ptpmcrender.fcgi")) {
            return null;
        }
        String idStr = UrlUtils.getDocIdStr(europepmcUrl, null);
        if (idStr == null) {
            return europepmcUrl;
        }
        // The "accid" parameter gets a "PMC"-prefixed id.
        String accId = idStr.startsWith("PMC") ? idStr : "PMC" + idStr;
        return europepmcPageUrlBasePath + accId + "&blobtype=pdf";
    }

    /**
     * Replaces the first "https" occurrence with "http", for "manuscript.elsevier.com" urls.
     *
     * @param manuscriptElsevierUrl the url to check; must not be null
     * @return the downgraded url, or null for non-handled urls
     */
    public static String checkAndDowngradeManuscriptElsevierUrl(String manuscriptElsevierUrl) {
        if (!manuscriptElsevierUrl.contains("manuscript.elsevier.com")) {
            return null;
        }
        return Strings.CS.replace(manuscriptElsevierUrl, "https", "http", 1);
    }

    /**
     * Rewrites a "ntrs.nasa.gov/citations" page-url (not containing "api/") to
     * "https://ntrs.nasa.gov/api/citations/.../downloads/{id}.pdf".
     *
     * <p>The path is taken from right after the domain, so that urls having another scheme or
     * sub-domain prefix (e.g. "http://") do not end up with the whole original url embedded in the result.
     *
     * @param nasaPageUrl the url to check; must not be null
     * @return the rewritten url, the given url if no id could be extracted, or null for non-handled urls
     */
    public static String checkAndGetNasaDocUrl(String nasaPageUrl) {
        int domainIndex = nasaPageUrl.indexOf("ntrs.nasa.gov/citations");
        if (domainIndex < 0 || nasaPageUrl.contains("api/")) {
            return null;
        }
        String idStr = UrlUtils.getDocIdStr(nasaPageUrl, null);
        if (idStr == null) {
            return nasaPageUrl;
        }
        // Everything after "ntrs.nasa.gov/", i.e. "citations/...".
        String citationPath = nasaPageUrl.substring(domainIndex + nasaDomainWithSlash.length());
        if (!citationPath.endsWith("/")) {
            citationPath += "/";
        }
        return nasaBaseDomainPath + "api/" + citationPath + "downloads/" + idStr + ".pdf";
    }

    /**
     * Rewrites a "www.frontiersin.org" article-url to its "/pdf" form.
     *
     * @param frontiersinPageUrl the url to check; must not be null
     * @return the "/pdf" url, the given url if it already ends with "/pdf" or no id could be extracted,
     *         or null for non-handled urls
     * @throws RuntimeException if the url belongs to the domain, does not end with "/pdf" and does not contain "/article"
     */
    public static String checkAndGetFrontiersinDocUrl(String frontiersinPageUrl) {
        if (!frontiersinPageUrl.contains("www.frontiersin.org")) {
            return null;
        }
        if (frontiersinPageUrl.endsWith("/pdf")) {
            return frontiersinPageUrl;
        }
        if (!frontiersinPageUrl.contains("/article")) {
            throw new RuntimeException("This \"frontiersin\"-url is known to not lead to a docUrl: " + frontiersinPageUrl);
        }
        // The id itself is not used below; its absence just means that the url should be left untouched.
        if (UrlUtils.getDocIdStr(frontiersinPageUrl, null) == null) {
            return frontiersinPageUrl;
        }
        if (frontiersinPageUrl.endsWith("/full")) {
            // Swap only the trailing "/full"; any earlier "/full" in the url must stay intact.
            return stripSuffix(frontiersinPageUrl, "/full") + "/pdf";
        }
        return frontiersinPageUrl + "/pdf";
    }

    /**
     * Appends "/download" to "psyarxiv.com" urls which do not already contain it.
     *
     * @param pageUrl the url to check; must not be null
     * @return the (possibly extended) url, or null for non-handled urls
     */
    public static String checkAndHandlePsyarxiv(String pageUrl) {
        if (!pageUrl.contains("psyarxiv.com")) {
            return null;
        }
        if (pageUrl.contains("/download")) {
            return pageUrl;
        }
        return pageUrl.endsWith("/") ? pageUrl + "download" : pageUrl + "/download";
    }

    /**
     * Replaces every "dergipark.gov.tr" occurrence with "dergipark.org.tr".
     * Unlike the other handlers, the given url is returned (unchanged) also for non-handled urls.
     *
     * @param pageUrl the url to check
     * @return the url with the domain replaced, where applicable
     */
    public static String checkAndHandleDergipark(String pageUrl) {
        return Strings.CS.replace(pageUrl, "dergipark.gov.tr", "dergipark.org.tr");
    }

    /**
     * Extracts the pdf-url from the html of a "turkjgastroenterol" page, normalizes and checks it, and then
     * connects to it in order to check its mime-type. On every failure handled here, an "unreachable"
     * record is added through {@code UrlUtils.addOutputData()}.
     *
     * @param pageHtml   the html of the page, in which the pdf-url is searched
     * @param urlId      the id of the record
     * @param sourceUrl  the source-url of the record
     * @param pageUrl    the url of the page, used also as the base for resolving the relative pdf-url
     * @param pageDomain the domain of the page
     * @return true if the connection-and-mime-type check completed without an exception, false otherwise
     */
    public static boolean extractAndCheckTurkjgastroenterolDocUrl(String pageHtml, String urlId, String sourceUrl, String pageUrl, String pageDomain) {
        Matcher matcher = Turkjgastroenterol_docUrl_pattern.matcher(pageHtml);
        if (!matcher.find()) {
            // NOTE(review): unlike the two failure-cases right below, this one neither logs nor increments
            // "PageCrawler.contentProblematicUrls" - confirm whether that is intentional.
            addUnreachableOutputData(urlId, sourceUrl, pageUrl, pageDomain, TURKJGASTROENTEROL_EXTRACTION_ERROR, "true", "false");
            return false;
        }

        String pdfUrl;
        try {
            // The pattern is a public, re-assignable field, so the existence of group 1 is not guaranteed here.
            pdfUrl = matcher.group(1);
        } catch (Exception e) {
            logger.warn("No pdf-url was found inside the html of page: {}", pageUrl, e);
            addUnreachableOutputData(urlId, sourceUrl, pageUrl, pageDomain, TURKJGASTROENTEROL_EXTRACTION_ERROR, "true", "false");
            PageCrawler.contentProblematicUrls.incrementAndGet();
            return false;
        }
        if (pdfUrl == null || pdfUrl.isEmpty()) {
            logger.warn("No pdf-url was found inside the html of page: {}", pageUrl);
            addUnreachableOutputData(urlId, sourceUrl, pageUrl, pageDomain, TURKJGASTROENTEROL_EXTRACTION_ERROR, "true", "false");
            PageCrawler.contentProblematicUrls.incrementAndGet();
            return false;
        }

        // Resolve the extracted (relative) path against the page-url and normalize the result.
        String urlToCheck = ConnSupportUtils.getFullyFormedUrl(pageUrl, pdfUrl, null);
        if (urlToCheck == null || (urlToCheck = LoaderAndChecker.basicURLNormalizer.filter(urlToCheck)) == null) {
            logger.warn("Could not normalize url: {}", pdfUrl);
            addUnreachableOutputData(urlId, sourceUrl, pageUrl, pageDomain, "Discarded in 'PageCrawler.visit()' method, as the retrievied \"turkjgastroenterol\"-pdf-url had normalization's problems.", "true", "false");
            LoaderAndChecker.connProblematicUrls.incrementAndGet();
            return false;
        }

        // No output-record is added here when the checks reject the url.
        if ((urlToCheck = LoaderAndChecker.handleUrlChecks(urlId, urlToCheck)) == null) {
            return false;
        }

        // The same target-url may have been already retrieved for another record.
        IdUrlMimeTypeTriple originalIdUrlMimeTypeTriple = UrlUtils.resultUrlsWithIDs.get(urlToCheck);
        if (originalIdUrlMimeTypeTriple != null) {
            // NOTE(review): "urlToCheck" is passed for all three url-arguments, here and in the connect-call below,
            // instead of the "sourceUrl" / "pageUrl" - confirm against the signatures of these methods.
            ConnSupportUtils.handleReCrossedTargetUrl(urlId, urlToCheck, urlToCheck, urlToCheck, originalIdUrlMimeTypeTriple, true);
            return false;
        }

        boolean isPossibleDocOrDatasetUrl = true;
        try {
            HttpConnUtils.connectAndCheckMimeType(urlId, urlToCheck, urlToCheck, urlToCheck, null, true, isPossibleDocOrDatasetUrl);
        } catch (Exception e) {
            // The returned list holds, in order: the "wasValid" value, the "couldRetry" value and the error-text.
            List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e, urlToCheck);
            String wasValid = list.get(0);
            String couldRetry = list.get(1);
            String errorMsg = "Discarded in 'PageCrawler.visit()' method, as there was a problem in checking the retrieved 'turkjgastroenterol'-pdf-url: " + list.get(2);
            addUnreachableOutputData(urlId, sourceUrl, pageUrl, pageDomain, errorMsg, wasValid, couldRetry);
            return false;
        }
        return true;
    }

    /**
     * Adds an "unreachable" output-record; only the error-message, the "wasValid" and the "couldRetry"
     * values differ between the failure-cases of the "turkjgastroenterol" handling.
     */
    private static void addUnreachableOutputData(String urlId, String sourceUrl, String pageUrl, String pageDomain, String errorMsg, String wasValid, String couldRetry) {
        UrlUtils.addOutputData(urlId, sourceUrl, pageUrl, "unreachable", errorMsg, "null", pageDomain, true, "true", wasValid, "false", "false", couldRetry, null, "null", "null");
    }

    /**
     * Searches the given elements of an "aup-online.com" page for one whose "data-title" attribute contains
     * "ownload" and whose (trimmed) "action" attribute is not empty. This method never returns normally.
     *
     * @param pageUrl            the url of the page, used only in the exception-message
     * @param elementLinksOnPage the elements to search in
     * @throws DocLinkFoundException       carrying the "action" value of the first matching element
     * @throws DocLinkUnavailableException if no element matches
     */
    public static void handleAupOnlinePage(String pageUrl, Elements elementLinksOnPage) throws DocLinkFoundException, DocLinkUnavailableException {
        for (Element el : elementLinksOnPage) {
            // "ownload" covers both the "Download" and "download" spellings.
            if (!el.attr("data-title").contains("ownload")) {
                continue;
            }
            String possibleDocUrl = el.attr("action").trim();
            if (possibleDocUrl.isEmpty()) {
                continue;
            }
            throw new DocLinkFoundException(possibleDocUrl, PageStructureMLA.getPageTagAndClassStructureForElement(el), false);
        }
        throw new DocLinkUnavailableException("No docUrl was found inside a form-element, for \"aup-online.com\" pageUrl: " + pageUrl);
    }

    /**
     * Rewrites a "www.ijcseonline.org" "pdf_paper_view.php" url to "https://www.ijcseonline.org/pub_paper/{fileName}",
     * where the file-name is whatever follows the last {@code '&'} of the url's last path-segment.
     *
     * @param pageUrl the url to check; must not be null
     * @return the rewritten url, the given url if it cannot be transformed, or null for non-handled urls
     */
    public static String checkAndHandleIjcseonlinePage(String pageUrl) {
        if (!pageUrl.contains("www.ijcseonline.org")) {
            return null;
        }
        if (!pageUrl.contains("pdf_paper_view.php")) {
            return pageUrl;
        }
        Matcher matcher = IJCSEONLINE_PDF_FILENAME.matcher(pageUrl);
        if (!matcher.matches()) {
            return pageUrl;
        }
        String pdfFileName = matcher.group(1);
        if (pdfFileName == null || pdfFileName.isEmpty()) {
            logger.error("No pdf-file-name was extracted from pageUrl: {}", pageUrl);
            return pageUrl;
        }
        return ijcseonlineBaseUrl + pdfFileName;
    }

    /**
     * Rewrites an "ieeexplore.ieee.org" url (not containing "/stampPDF/") to the
     * "stampPDF/getPDF.jsp?tp=&arnumber={id}" form, using the id given by {@code UrlUtils.getDocIdStr()}.
     *
     * @param pageUrl the url to check; must not be null
     * @return the rewritten url, the given url if it cannot (or need not) be transformed, or null for non-handled urls
     */
    public static String checkAndHandleIeeeExplorer(String pageUrl) {
        if (!pageUrl.contains("ieeexplore.ieee.org")) {
            return null;
        }
        if (pageUrl.contains("/stampPDF/")) {
            return pageUrl;
        }
        String idStr = UrlUtils.getDocIdStr(pageUrl, null);
        return (idStr != null) ? ieeexploreBasePath + idStr : pageUrl;
    }

    /**
     * Appends "/download" to "://osf.io" urls which do not already contain it.
     *
     * @param pageUrl the url to check; must not be null
     * @return the (possibly extended) url, or null for non-handled urls
     */
    public static String checkAndHandleOSFurls(String pageUrl) {
        if (!pageUrl.contains("://osf.io")) {
            return null;
        }
        if (pageUrl.contains("/download")) {
            return pageUrl;
        }
        return pageUrl.endsWith("/") ? pageUrl + "download" : pageUrl + "/download";
    }

    /**
     * Rewrites "onlinelibrary.wiley.com" doi-urls, as well as "api.wiley.com/onlinelibrary" urls,
     * towards the "/doi/pdfdirect/...?download=true" form.
     *
     * @param pageUrl the url to check; must not be null
     * @return the rewritten url, or null for non-handled urls (and for "api.wiley.com" urls with no extractable id)
     */
    public static String checkAndHandleWileyUrls(String pageUrl) {
        Matcher matcher = ONLINELIBRARY_WILEY.matcher(pageUrl);
        if (!matcher.matches()) {
            if (!pageUrl.contains("api.wiley.com/onlinelibrary")) {
                return null;
            }
            String docIdStr = UrlUtils.getDocIdStr(pageUrl, null);
            return (docIdStr != null) ? "https://onlinelibrary.wiley.com/doi/pdfdirect/" + docIdStr + "?download=true" : null;
        }

        String subJournal = matcher.group(1);
        if (subJournal != null && !subJournal.isEmpty()) {
            // Cut out exactly the matched path-segment; a textual replace could also hit the same text later in the url.
            pageUrl = pageUrl.substring(0, matcher.start(1)) + pageUrl.substring(matcher.end(1));
        }

        if (pageUrl.contains("/pdfdirect/")) {
            return withWileyDownloadParam(pageUrl);
        }

        // Drop only the trailing marker, not every occurrence of it.
        if (pageUrl.endsWith("/abstract")) {
            pageUrl = stripSuffix(pageUrl, "/abstract");
        } else if (pageUrl.endsWith("/fullpdf")) {
            pageUrl = stripSuffix(pageUrl, "/fullpdf");
        }

        // The order matters: "epdf/" has to be checked before "pdf/", since the former contains the latter.
        if (pageUrl.contains("epdf/")) {
            pageUrl = Strings.CS.replace(pageUrl, "epdf/", "pdfdirect/", 1);
        } else if (pageUrl.contains("pdf/")) {
            pageUrl = Strings.CS.replace(pageUrl, "pdf/", "pdfdirect/", 1);
        } else if (pageUrl.contains("full/")) {
            pageUrl = Strings.CS.replace(pageUrl, "full/", "pdfdirect/", 1);
        } else if (pageUrl.contains("abs/")) {
            // NOTE(review): the check is broader than the replaced text; an url containing "abs/" but not "/doi/abs/" stays as-is.
            pageUrl = Strings.CS.replace(pageUrl, "/doi/abs/", "/doi/pdfdirect/", 1);
        } else if (pageUrl.contains("full-xml/")) {
            // NOTE(review): this is the only branch that does not lead to "pdfdirect" - confirm that "/full/" is intended.
            pageUrl = Strings.CS.replace(pageUrl, "/full-xml/", "/full/", 1);
        } else {
            pageUrl = Strings.CS.replace(pageUrl, "/doi/", "/doi/pdfdirect/", 1);
        }
        return withWileyDownloadParam(pageUrl);
    }

    /** Appends the "download=true" query-parameter, unless the url already contains it. */
    private static String withWileyDownloadParam(String url) {
        if (url.contains("download=true")) {
            return url;
        }
        return url + (url.contains("?") ? "&" : "?") + "download=true";
    }

    /** Removes the given suffix from the end of the text; the caller must have verified that the text ends with it. */
    private static String stripSuffix(String text, String suffix) {
        return text.substring(0, text.length() - suffix.length());
    }

    /**
     * Rewrites "/pdfdirect/" to "/pdf/" for "://www.embopress.org" urls.
     *
     * @param pageUrl the url to check; must not be null
     * @return the (possibly rewritten) url, or null for non-handled urls
     */
    public static String checkAndHandleEmbopressUrls(String pageUrl) {
        if (!pageUrl.contains("://www.embopress.org")) {
            return null;
        }
        if (pageUrl.contains("/pdf/") || !pageUrl.contains("/pdfdirect/")) {
            return pageUrl;
        }
        return Strings.CS.replace(pageUrl, "/pdfdirect/", "/pdf/", 1);
    }

    /**
     * Un-escapes the ampersands of "scielo.br" urls: both the full "&amp;amp;" html-entity and a bare "amp;"
     * leftover are turned into a single {@code '&'}.
     *
     * @param pageUrl the url to check; must not be null
     * @return the un-escaped url, or null for non-handled urls
     */
    public static String checkAndHandleScieloUrls(String pageUrl) {
        if (!pageUrl.contains("scielo.br")) {
            return null;
        }
        // Handle the full entity first; otherwise "&amp;" would end up as "&&".
        String unescapedUrl = Strings.CS.replace(pageUrl, "&amp;", "&");
        return Strings.CS.replace(unescapedUrl, "amp;", "&");
    }

    /**
     * Extracts the http(s) link that is embedded as the path of a doi.org url and restores the
     * scheme-separators which have only one slash (":/" becomes "://").
     *
     * @param weirdDoiUrl the url to check; must not be null
     * @return the inner link, the given url if the inner link could not be extracted, or null for non-handled urls
     */
    public static String checkAndHandleDoiUrlsWithInnerLinks(String weirdDoiUrl) {
        Matcher matcher = DOI_URL_WITH_INNER_LINK.matcher(weirdDoiUrl);
        if (!matcher.matches()) {
            return null;
        }
        String innerLink = matcher.group(1);
        if (innerLink == null || innerLink.isEmpty()) {
            logger.warn("Could not extract the inner-link from weird-doi-url: {}", weirdDoiUrl);
            return weirdDoiUrl;
        }
        // Only repair a ":/" which lacks its second slash; an intact "://" must not become ":///".
        return COLON_WITH_SINGLE_SLASH.matcher(innerLink).replaceAll("://");
    }
}

