package org.gcube.textextractor.extractors;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.gcube.textextractor.helpers.ExtractorHelper;

/* loaded from: input_file:WEB-INF/lib/smartfish-doc-processor-1.0.0-3.1.1.jar:org/gcube/textextractor/extractors/PDFExtractor.class */
/**
 * {@link InformationExtractor} implementation that pulls the body text, title and
 * detected language out of PDF files using Apache Tika's {@link PDFParser}.
 */
public class PDFExtractor extends InformationExtractor {
    /**
     * Creates the extractor by forwarding the argument to the superclass constructor.
     *
     * @param str value handed unchanged to {@link InformationExtractor}
     *            (NOTE(review): presumably the input path — confirm against the superclass)
     */
    public PDFExtractor(String str) {
        super(str);
    }

    /**
     * Parses every file returned by {@code getFilenames()} as a PDF and builds one
     * record per successfully parsed file.
     *
     * <p>Each record contains the keys {@code documentID} (the filename), {@code text}
     * (the parsed body with empty lines removed), {@code title} (the {@code title}
     * metadata entry, which may be {@code null}) and {@code language} (the language
     * identified from the text).
     *
     * <p>Extraction is best-effort: a file that cannot be opened or parsed is reported
     * on standard error and skipped, and processing continues with the next file.
     *
     * @return the records of all files that could be processed; never {@code null}
     * @throws FileNotFoundException declared by the overridden method; open failures are
     *                               handled per file inside this implementation
     */
    @Override // org.gcube.textextractor.extractors.InformationExtractor
    List<Map<String, String>> extractInfo() throws FileNotFoundException {
        List<Map<String, String>> records = new ArrayList<Map<String, String>>();
        for (String filename : getFilenames()) {
            try {
                Metadata metadata = new Metadata();
                String text;
                FileInputStream input = new FileInputStream(filename);
                try {
                    BodyContentHandler handler = new BodyContentHandler();
                    new PDFParser().parse(input, handler, metadata, new ParseContext());
                    text = ExtractorHelper.removeEmptyLines(handler.toString());
                } finally {
                    // The parser does not close the stream; release the file handle
                    // whether parsing succeeded or failed.
                    input.close();
                }

                Map<String, String> fields = new HashMap<String, String>();
                fields.put("documentID", filename);
                fields.put("text", text);
                fields.put("title", metadata.get("title"));
                fields.put("language", new LanguageIdentifier(text).getLanguage());
                records.add(fields);
            } catch (Exception e) {
                // Deliberately broad: one broken document must not abort the whole batch.
                // Name the file so the stack trace can be tied to its input.
                System.err.println("PDFExtractor: failed to extract " + filename);
                e.printStackTrace();
            }
        }
        return records;
    }

    /**
     * Builds the rowset for one extracted record by delegating to
     * {@code ExtractorHelper.createRowseFromFields}.
     *
     * @param map a record holding at least the {@code documentID} and {@code language} keys
     * @return whatever the helper produces for this record
     */
    @Override // org.gcube.textextractor.extractors.InformationExtractor
    public String createCustomRowset(Map<String, String> map) {
        String documentId = map.get("documentID");
        String language = map.get("language");
        return ExtractorHelper.createRowseFromFields(documentId, InformationExtractor.collectionID, InformationExtractor.idxType, language, map);
    }

    /**
     * No enrichment is implemented for PDF records.
     *
     * @param map the record to enrich (ignored)
     * @param str additional argument defined by the overridden method (ignored)
     * @return always {@code null}
     */
    @Override // org.gcube.textextractor.extractors.InformationExtractor
    Map<String, String> enrichRecord(Map<String, String> map, String str) {
        return null;
    }
}
