package eu.dnetlib.data.mapreduce.hbase.dedup.experiment;

import com.google.common.base.Splitter;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;

/* loaded from: input_file:WEB-INF/lib/dnet-mapreduce-jobs-1.2.2.jar:eu/dnetlib/data/mapreduce/hbase/dedup/experiment/SubjectParser.class */
/**
 * Extracts subject terms from the {@code subject} elements of a dom4j document and groups them
 * by a guessed subject type.
 *
 * <p>A subject whose text starts with {@code info:eu-repo} is run through {@link #REGEX_SUBJECT}:
 * its third capture group becomes the type and its fourth the value. Any other subject is typed
 * {@code "keyword"} and split on commas into individual values.
 *
 * <p>Every value is reduced to ASCII letters and spaces, lower-cased, and kept only when it is at
 * least {@code MIN_LENGTH} characters long and not blank.
 */
public class SubjectParser {
    public static final String REGEX_SUBJECT = "^(info:eu-repo)\\/(classification)\\/([a-zA-Z]*)\\/(.*)$";
    private static final int MIN_LENGTH = 5;

    private static final String KEYWORD_TYPE = "keyword";
    private static final String CLASSIFICATION_PREFIX = "info:eu-repo";

    // Compiled once: String.replaceAll/replaceFirst would recompile these for every subject.
    private static final Pattern SUBJECT_PATTERN = Pattern.compile(REGEX_SUBJECT);
    private static final Pattern NON_LETTER_OR_SPACE = Pattern.compile("[^a-zA-Z ]");

    private static final Splitter KEYWORD_SPLITTER = Splitter.on(",").trimResults().omitEmptyStrings();

    /**
     * Collects the subjects found in {@code document}, grouped by guessed type.
     *
     * <p>Elements are selected by local name, so the namespace of the {@code subject} element is
     * ignored. An entry is created in the result for every guessed type, even when none of its
     * values survives normalisation (and even when the type itself is blank).
     *
     * @param document the document to scan
     * @return a map from subject type to the normalised subjects of that type
     */
    public SubjectsMap parse(Document document) {
        List<?> subjectNodes = document.selectNodes("//*[local-name() = 'subject']");
        SubjectsMap subjectsMap = new SubjectsMap();
        for (Object node : subjectNodes) {
            String text = ((Element) node).getText();
            String type = guessType(text);
            if (!subjectsMap.containsKey(type)) {
                subjectsMap.put(type, new Subjects());
            }
            if (StringUtils.isBlank(type)) {
                // e.g. "info:eu-repo/classification//x": the type group matched the empty string.
                continue;
            }
            if (KEYWORD_TYPE.equals(type)) {
                for (String token : KEYWORD_SPLITTER.split(text)) {
                    addNormalized(subjectsMap, type, token);
                }
            } else {
                // A blank remainder normalises to a blank value, which addNormalized rejects.
                addNormalized(subjectsMap, type, SUBJECT_PATTERN.matcher(text).replaceFirst("$4"));
            }
        }
        return subjectsMap;
    }

    /**
     * Normalises {@code raw} and adds it to the subjects registered under {@code type}.
     *
     * <p>Normalisation removes every character other than ASCII letters and the space character,
     * then lower-cases with {@link Locale#ROOT} so the result does not depend on the JVM default
     * locale (under a Turkish locale {@code "I"} would otherwise become a dotless {@code ı}).
     * Values shorter than {@code MIN_LENGTH}, or consisting only of spaces, are dropped.
     */
    private static void addNormalized(SubjectsMap subjectsMap, String type, String raw) {
        String value = NON_LETTER_OR_SPACE.matcher(raw).replaceAll("").toLowerCase(Locale.ROOT);
        if (value.length() >= MIN_LENGTH && StringUtils.isNotBlank(value)) {
            subjectsMap.get(type).add(value);
        }
    }

    /**
     * Guesses the type of a subject string.
     *
     * @param str the raw subject text
     * @return {@code "keyword"} unless {@code str} starts with {@code info:eu-repo}; otherwise
     *         {@code str} with the match of {@link #REGEX_SUBJECT} replaced by its third capture
     *         group (so a string that does not match the pattern is returned unchanged)
     */
    private String guessType(String str) {
        if (!str.startsWith(CLASSIFICATION_PREFIX)) {
            return KEYWORD_TYPE;
        }
        return SUBJECT_PATTERN.matcher(str).replaceAll("$3");
    }
}
