package gr.forth.ics.isl.xlink.textextractor;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import gr.forth.ics.isl.xlink.exceptions.FalseFileTypeException;
import gr.forth.ics.isl.xlink.util.HTMLTag;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.logging.Level;
import java.util.logging.Logger;

/* loaded from: input_file:gr/forth/ics/isl/xlink/textextractor/WebPageTextExtractor.class */
/**
 * {@link TextExtractor} that reads the resource behind a URL and dispatches to a
 * format-specific extractor based on the content type reported by the connection.
 *
 * <p>PDF content is read directly from the URL; Word, PowerPoint, plain-text and XML
 * content is delegated to the matching extractor, which is handed the URL's path
 * component; anything else is treated as HTML.
 */
public class WebPageTextExtractor implements TextExtractor {
    private static final Logger LOGGER = Logger.getLogger(WebPageTextExtractor.class.getName());

    private final String webPageUrl;

    /**
     * @param str the URL of the resource to extract text from
     */
    public WebPageTextExtractor(String str) {
        this.webPageUrl = str;
    }

    /**
     * Extracts the text of the resource at the configured URL.
     *
     * <p>Failures to parse the URL, open the connection or open the PDF are logged at
     * {@code SEVERE} and yield an empty string instead of a {@code NullPointerException}.
     *
     * @return the extracted text, or an empty string when nothing could be extracted
     * @throws FalseFileTypeException propagated from the delegated extractors
     */
    @Override // gr.forth.ics.isl.xlink.textextractor.TextExtractor
    public String extractText() throws FalseFileTypeException {
        final URL url;
        try {
            url = new URL(this.webPageUrl);
        } catch (MalformedURLException e) {
            LOGGER.log(Level.SEVERE, "Malformed URL: " + this.webPageUrl, e);
            return "";
        }

        final URLConnection connection;
        try {
            connection = url.openConnection();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Cannot open connection to " + url, e);
            return "";
        }

        // Query the content type once; parameters such as "; charset=UTF-8" are stripped
        // so that they do not defeat the comparisons below.
        final String contentType = baseContentType(connection.getContentType());
        // NOTE(review): the delegated extractors receive only the URL's path component,
        // exactly as before — confirm they are meant to read it as a local file path.
        final String path = url.getPath();

        if (contentType.equalsIgnoreCase("application/pdf")) {
            System.out.println("# Reading PDF file!");
            return extractPdfText(url);
        }
        if (contentType.equalsIgnoreCase("content/unknown")) {
            final String lowerPath = path.toLowerCase();
            if (lowerPath.endsWith("doc") || lowerPath.endsWith("docx")) {
                System.out.println("# Reading MSWORD file!");
                return new WordTextExtractor(path).extractText();
            }
            if (lowerPath.endsWith("ppt") || lowerPath.endsWith("pptx")) {
                System.out.println("# Reading MSPOWERPOINT file!");
                return new PowerPointTextExtractor(path).extractText();
            }
            System.out.println("to-do");
            return "";
        }
        if (contentType.equalsIgnoreCase("text/plain")) {
            System.out.println("# Reading txt file!");
            return new TXTTextExtractor(path).extractText();
        }
        if (contentType.equalsIgnoreCase("application/xml")) {
            System.out.println("# Reading xml file!");
            return new XMLbasedTextExtractor(path).extractText();
        }
        // Every other content type is handled as an HTML page.
        return new HTMLTag(url).getSourceCode();
    }

    /**
     * Reduces a raw content-type value to its bare media type.
     *
     * @param rawContentType the value reported by the connection, possibly {@code null}
     * @return the media type without parameters; {@code "content/unknown"} when the
     *         connection reported no content type at all
     */
    private static String baseContentType(String rawContentType) {
        if (rawContentType == null) {
            return "content/unknown";
        }
        final int parametersStart = rawContentType.indexOf(';');
        final String mediaType = parametersStart >= 0
                ? rawContentType.substring(0, parametersStart)
                : rawContentType;
        return mediaType.trim();
    }

    /**
     * Concatenates the text of every page of the PDF at {@code url}, one page per line.
     * A page that fails to parse is logged and skipped; the reader is always closed.
     */
    private static String extractPdfText(URL url) {
        final PdfReader pdfReader;
        try {
            pdfReader = new PdfReader(url);
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Cannot read PDF from " + url, e);
            return "";
        }
        try {
            final StringBuilder text = new StringBuilder();
            final int numberOfPages = pdfReader.getNumberOfPages();
            // PDF page numbers are 1-based.
            for (int page = 1; page <= numberOfPages; page++) {
                try {
                    text.append(PdfTextExtractor.getTextFromPage(pdfReader, page)).append("\n");
                } catch (IOException e) {
                    LOGGER.log(Level.SEVERE, "Cannot extract text from page " + page + " of " + url, e);
                }
            }
            return text.toString();
        } finally {
            pdfReader.close();
        }
    }
}
