package ws.palladian.preprocessing.segmentation;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactoryConfigurationError;
import org.apache.commons.configuration.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.SAXException;
import ws.palladian.extraction.content.PageContentExtractorException;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.ConfigHolder;
import ws.palladian.helper.collection.Bag;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.math.SetSimilarities;
import ws.palladian.preprocessing.segmentation.Segment;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.PageAnalyzer;

/* loaded from: input_file:ws/palladian/preprocessing/segmentation/PageSegmenter.class */
public class PageSegmenter {
    /** Default q-gram length; same value as the literal fallback used by {@link #loadConfig()}. */
    private static final int DEFAULT_LENGTH_OF_Q_GRAMS = 9;
    /** Default number of similar documents; same value as the literal fallback used by {@link #loadConfig()}. */
    private static final int DEFAULT_NUM_SIMILAR_DOCUMENTS = 5;
    /** The document that is segmented and colored. */
    private Document document;
    /** Path the colored document is written to by {@code colorSegments}; initialised to the empty string. */
    private String storeLocation;
    /** Result of {@link #startPageSegmentation()}; {@code null} until that method has run. */
    private List<Segment> segments;
    /** Documents compared against {@link #document}; searched for automatically when {@code null}. */
    private List<Document> similarFiles;
    private static final Logger LOGGER = LoggerFactory.getLogger(PageSegmenter.class);
    /** N-gram length handed to the tokenizer when fingerprints are built; overwritten by {@link #loadConfig()}. */
    private static int lengthOfQGrams = 9;
    private static final int DEFAULT_AMOUNT_OF_Q_GRAMS = 5000;
    /**
     * Passed to {@code createFingerprint} as its first int argument; overwritten by {@link #loadConfig()}.
     * NOTE(review): the visible fingerprint code never reads that argument.
     */
    private static int amountOfQGrams = DEFAULT_AMOUNT_OF_Q_GRAMS;
    private static final double DEFAULT_SIMILARITY_NEED = 0.689d;
    /** Minimum combined score a candidate document must reach in {@code findSimilarFiles}. */
    private static double similarityNeed = DEFAULT_SIMILARITY_NEED;
    private static final int DEFAULT_MAX_DEPTH = 100;
    /** Initial recursion budget handed to {@code compareDocuments}. */
    private static int maxDepth = DEFAULT_MAX_DEPTH;
    /** Number of accepted similar documents after which {@code findSimilarFiles} stops searching. */
    private static int numberOfSimilarDocuments = 5;

    /**
     * Creates a segmenter without a document and with an empty store location, then loads the
     * segmentation settings from the global configuration.
     */
    public PageSegmenter() {
        // Reference fields already start out as null; only the store location needs a non-null value.
        storeLocation = "";
        loadConfig();
    }

    /**
     * Creates a segmenter for the given document with an empty store location.
     *
     * @param document the document to segment
     */
    public PageSegmenter(Document document) {
        this(document, "");
    }

    /**
     * Creates a segmenter for the given document and loads the segmentation settings from the
     * global configuration.
     *
     * @param document the document to segment
     * @param str      path the colored document is written to by {@code colorSegments}
     */
    public PageSegmenter(Document document, String str) {
        // segments and similarFiles keep their default null until segmentation runs.
        loadConfig();
        this.document = document;
        this.storeLocation = str;
    }

    /**
     * Colors all segments found by {@link #startPageSegmentation()}, marking them with borders.
     */
    public void colorSegments() {
        colorSegments(this.segments, true);
    }

    /**
     * Marks segments in the document with colored CSS classes and, if a store location is set,
     * writes the result to disk.
     *
     * <p>The method injects one CSS rule per color into the document, prefixes the {@code class}
     * attribute of every selected segment's element with the class matching the segment's color,
     * writes the document to the store location and finally writes a second copy next to it
     * (suffix {@code _test.html}) in which self-closed {@code script}, {@code iframe} and
     * {@code textarea} tags are expanded into explicit open/close pairs.</p>
     *
     * <p>If no store location is configured the document is only modified in memory.</p>
     *
     * @param list either a list of {@link Segment}s or a list of XPath strings selecting segments
     *             of the last segmentation run; the type of the first element decides. A
     *             {@code null} or empty list colors nothing.
     * @param bool {@code true} (or {@code null}) to draw borders, {@code false} to set the
     *             background color
     */
    public void colorSegments(List<?> list, Boolean bool) {
        String[] palette = {"#ff0000", "#ff9600", "#ffc800", "#ffff00", "#e6ff00", "#c8ff00", "green"};
        boolean asBorder = bool == null || bool.booleanValue();

        StringBuilder css = new StringBuilder("\n.myPageSegmenterBorder_dummy_NOTINUSE { border: 2px solid blue; }\n");
        for (int i = 0; i < palette.length; i++) {
            css.append(".myPageSegmenterBorder").append(i);
            css.append(asBorder ? " { border: 2px solid " : " { background-color: ");
            css.append(palette[i]).append("; }\n");
        }
        Text cssText = this.document.createTextNode(css.toString());
        NodeList existingStyles = this.document.getElementsByTagName("style");
        if (existingStyles.getLength() != 0) {
            // Reuse the first style element of the page.
            existingStyles.item(0).appendChild(cssText);
        } else {
            Element style = this.document.createElement("style");
            style.setAttribute("type", "text/css");
            style.appendChild(cssText);
            Node styleParent = this.document.getElementsByTagName("head").item(0);
            if (styleParent == null) {
                // Pages without a head would otherwise fail with a NullPointerException.
                styleParent = this.document.getDocumentElement();
            }
            if (styleParent != null) {
                styleParent.appendChild(style);
            } else {
                LOGGER.warn("Document has neither a head nor a root element; segment styles were not added.");
            }
        }

        for (Segment segment : resolveSegmentsToColor(list)) {
            LOGGER.info(segment.getVariability() + " " + segment.getColor() + " " + segment.getXPath());
            Node node = XPathHelper.getXhtmlNode(this.document, segment.getXPath());
            if (!(node instanceof Element)) {
                LOGGER.warn("No element found for XPath {}; segment is not colored.", segment.getXPath());
                continue;
            }
            Element element = (Element) node;
            element.setAttribute("class", cssClassFor(segment.getColor()) + " " + element.getAttribute("class"));
        }

        // Compare by content, not by reference: an empty location that is not the interned "" must
        // be treated as "nothing to write" as well.
        if (this.storeLocation == null || this.storeLocation.isEmpty()) {
            LOGGER.warn("No store location set; the colored document is not written to disk.");
            return;
        }
        HtmlHelper.writeToFile(this.document, new File(this.storeLocation));

        String text = new DocumentRetriever().getText(this.storeLocation);
        if (text == null) {
            LOGGER.warn("Could not read {} back; the test copy is not written.", this.storeLocation);
            return;
        }
        // The last five characters are replaced (presumably the ".html" extension); shorter
        // locations are replaced entirely instead of failing.
        int extensionStart = Math.max(0, this.storeLocation.length() - 5);
        String str3 = this.storeLocation.substring(0, extensionStart) + "_test.html";
        for (String tag : new String[]{"script", "iframe", "textarea"}) {
            text = expandSelfClosedTags(text, tag);
        }

        // NOTE(review): every char is truncated to its low byte, exactly as before; characters above
        // U+00FF are therefore not preserved. Confirm the intended output encoding.
        byte[] bytes = new byte[text.length()];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = (byte) text.charAt(i);
        }
        try (FileOutputStream out = new FileOutputStream(str3)) {
            out.write(bytes);
        } catch (FileNotFoundException e) {
            LOGGER.error("FileNotFoundException for {}", str3, e);
        } catch (IOException e2) {
            LOGGER.error("IOException for {}", str3, e2);
        }
    }

    /**
     * Turns the caller's selection (segments or XPath strings) into the list of segments to color.
     */
    private List<Segment> resolveSegmentsToColor(List<?> list) {
        List<Segment> resolved = new ArrayList<Segment>();
        if (list == null || list.isEmpty() || list.get(0) == null) {
            LOGGER.warn("No segments given; nothing to color.");
            return resolved;
        }
        Object first = list.get(0);
        LOGGER.info(first.getClass().getSimpleName());
        LOGGER.info("{}", first);
        if (first instanceof String) {
            LOGGER.info("... War ein String");
            if (this.segments != null) {
                for (Segment segment : this.segments) {
                    if (list.contains(segment.getXPath())) {
                        resolved.add(segment);
                    }
                }
            }
        } else if (first instanceof Segment) {
            for (Object candidate : list) {
                if (candidate instanceof Segment) {
                    resolved.add((Segment) candidate);
                }
            }
        }
        return resolved;
    }

    /**
     * Maps a segment color to the CSS class injected for it; unknown colors map to the empty string.
     */
    private static String cssClassFor(Segment.Color color) {
        if (color == Segment.Color.RED) {
            return "myPageSegmenterBorder0";
        } else if (color == Segment.Color.LIGHTRED) {
            return "myPageSegmenterBorder1";
        } else if (color == Segment.Color.REDYELLOW) {
            return "myPageSegmenterBorder2";
        } else if (color == Segment.Color.YELLOW) {
            return "myPageSegmenterBorder3";
        } else if (color == Segment.Color.GREENYELLOW) {
            return "myPageSegmenterBorder4";
        } else if (color == Segment.Color.LIGHTGREEN) {
            return "myPageSegmenterBorder5";
        } else if (color == Segment.Color.GREEN) {
            return "myPageSegmenterBorder6";
        }
        return "";
    }

    /**
     * Replaces the {@code />} that follows an opening {@code <tag } (when it comes before the next
     * {@code </tag>}) with an explicit closing tag and a marker comment.
     */
    private static String expandSelfClosedTags(String html, String tag) {
        String text = html;
        String openMarker = "<" + tag + " ";
        String closeMarker = "</" + tag + ">";
        int from = 0;
        while (from < text.length()) {
            int open = text.indexOf(openMarker, from);
            if (open == -1) {
                break;
            }
            int selfClose = text.indexOf("/>", open);
            int explicitClose = text.indexOf(closeMarker, open);
            if (selfClose == -1) {
                selfClose = text.length();
            }
            if (explicitClose == -1) {
                explicitClose = text.length();
            }
            if (selfClose < explicitClose) {
                text = text.substring(0, selfClose) + "></" + tag + "><!--fixedForPageSegmenter-->"
                        + text.substring(selfClose + 2);
            }
            // Continue behind whichever terminator came first.
            from = Math.min(selfClose, explicitClose);
        }
        return text;
    }

    /**
     * Colors only the segments of the last segmentation run that have the given color, using borders.
     *
     * @param color the color class of the segments to mark
     */
    public void colorSegments(Segment.Color color) {
        colorSegments(getSpecificSegments(color), true);
    }

    /**
     * Recursively compares the children of the two documents' first nodes position by position and
     * sorts the XPaths of the first document's children into conflicting (text differs) and
     * non-conflicting (text equal) ones.
     *
     * <p>NOTE(review): the declared element type of the returned array looks like a decompiler
     * artifact; at runtime both array slots hold the {@code List<String>} arguments.</p>
     *
     * @param document  (sub-)document of the page being segmented
     * @param document2 (sub-)document of the page it is compared with
     * @param list      collects XPaths whose text content differs; modified in place
     * @param list2     collects XPaths whose text content is equal; modified in place
     * @param i         remaining recursion budget; recursion stops once it is negative
     * @param str       XPath prefix of the nodes being compared
     * @return array with {@code list} at index 0 and {@code list2} at index 1
     */
    /* JADX WARN: Multi-variable type inference failed */
    private List<List<String>>[] compareDocuments(Document document, Document document2, List<String> list, List<String> list2, int i, String str) {
        NodeList ownChildren = document.getFirstChild().getChildNodes();
        NodeList otherChildren = document2.getFirstChild().getChildNodes();
        for (int index = 0; index < ownChildren.getLength(); index++) {
            Node own = ownChildren.item(index);
            if (own.getTextContent().length() == 0) {
                continue;
            }

            Node other;
            if (otherChildren.getLength() > index) {
                other = otherChildren.item(index);
            } else {
                // No counterpart at this position: compare against a placeholder with dummy text.
                other = document.createElement("newnode");
                other.setNodeValue("###");
                other.setTextContent("#####");
            }

            // Keep the constructed XPath from its second slash onwards; fewer than two slashes or
            // any '#' in the remainder yield an empty suffix.
            String fullPath = PageAnalyzer.constructXPath(own);
            String suffix = "";
            int firstSlash = fullPath.indexOf("/");
            if (firstSlash != -1) {
                int secondSlash = fullPath.indexOf("/", firstSlash + 1);
                if (secondSlash != -1) {
                    suffix = fullPath.substring(secondSlash);
                }
            }
            if (suffix.contains("#")) {
                suffix = "";
            }
            String path = str + suffix;

            if (own.getTextContent().equals(other.getTextContent())) {
                if (!list2.contains(path) && !list.contains(path)) {
                    list2.add(path);
                }
                continue;
            }

            if (!list.contains(path)) {
                list.add(path);
                list2.remove(path);
            }
            if (own.hasChildNodes() && other.hasChildNodes()) {
                Document ownSubDocument = PageSegmenterHelper.transformNodeToDocument(own);
                Document otherSubDocument = PageSegmenterHelper.transformNodeToDocument(other);
                if (i >= 0) {
                    compareDocuments(ownSubDocument, otherSubDocument, list, list2, i - 1, path);
                }
            }
        }
        return new List[]{list, list2};
    }

    /**
     * Builds a fingerprint of the document: the bag of word n-grams over the sequence of its tags.
     *
     * @param document the document to fingerprint
     * @param i        not read by this method (NOTE(review): callers pass the configured amount of
     *                 q-grams here)
     * @param i2       n-gram length handed to the tokenizer
     * @return bag holding every distinct n-gram with its count
     */
    public Bag<String> createFingerprint(Document document, int i, int i2) {
        String markup = HtmlHelper.xmlToString(document, false);
        StringBuilder tagSequence = new StringBuilder();
        for (Iterator<String> tags = PageSegmenterHelper.listTags(markup).iterator(); tags.hasNext();) {
            tagSequence.append(" ").append(tags.next());
        }
        Bag<String> nGrams = Bag.create(Tokenizer.calculateWordNGramsAsList(tagSequence.toString(), i2));
        // Copy the n-grams entry by entry together with their counts.
        Bag<String> fingerprint = Bag.create();
        for (Map.Entry<String, Integer> nGram : nGrams.unique()) {
            fingerprint.add(nGram.getKey(), nGram.getValue().intValue());
        }
        return fingerprint;
    }

    /**
     * Reduces the given segments to the "main" ones. The list is modified in place.
     *
     * <p>First every segment that has an ancestor node which is itself a segment of the same color
     * is removed, so that only the outermost segment of each equally colored nesting remains.
     * Afterwards all segments whose text content is shorter than 50 characters are removed.</p>
     *
     * @param list the segments to filter; must be modifiable
     * @return the same list instance after filtering
     */
    public List<Segment> findMainSegments(List<Segment> list) {
        final int minTextLength = 50;
        LOGGER.info("biggest-begin: " + list.size());

        int index = 0;
        while (index < list.size()) {
            if (hasSameColoredAncestor(list.get(index), list)) {
                // The next segment moves into this slot, so the index stays where it is.
                list.remove(index);
            } else {
                index++;
            }
        }

        for (Iterator<Segment> remaining = list.iterator(); remaining.hasNext();) {
            if (remaining.next().getNode().getTextContent().length() < minTextLength) {
                remaining.remove();
            }
        }

        for (Segment segment : list) {
            LOGGER.info(segment.getVariability() + " " + segment.getXPath());
        }
        LOGGER.info("biggest-end: " + list.size());
        return list;
    }

    /**
     * Tells whether any ancestor node of the segment is the node of another, equally colored
     * segment in {@code candidates}.
     */
    private static boolean hasSameColoredAncestor(Segment segment, List<Segment> candidates) {
        for (Node ancestor = segment.getNode().getParentNode(); ancestor != null; ancestor = ancestor.getParentNode()) {
            // Bounded scan: the previous inner loop never terminated when no candidate matched.
            for (Segment other : candidates) {
                if (other.getNode().isSameNode(ancestor) && !other.equals(segment)
                        && other.getColor().equals(segment.getColor())) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Searches the neighbourhood of the document for pages that are similar to it.
     *
     * <p>Steps: (1) pick seed URLs: a sample of about ten links of the document plus the document
     * URI and its shortened parent URIs; (2) collect all links of the seed pages as candidates;
     * (3) order the candidates so that URLs whose label equals the document's label (or is numeric
     * like it) come first; (4) download the candidates in batches of ten, score each against the
     * document's fingerprint and keep those whose score is at least {@code d} and below 1.0, until
     * {@code i3} documents are found.</p>
     *
     * @param document the reference document
     * @param i        passed through to {@link #createFingerprint(Document, int, int)}
     * @param i2       n-gram length used for the fingerprints
     * @param d        minimum combined score a candidate must reach
     * @param i3       number of accepted documents after which the search stops
     * @return the accepted documents in the order produced by {@code CollectionHelper.sortByValue}
     */
    private List<Document> findSimilarFiles(Document document, int i, int i2, double d, int i3) {
        DocumentRetriever documentRetriever = new DocumentRetriever();

        HashSet<String> pageLinks = new HashSet<String>();
        pageLinks.addAll(HtmlHelper.getLinks(document, true, false, ""));
        LOGGER.info("Anzahl Links: " + pageLinks.size());

        // Sample every (size / 10)-th link; the step is at least 1 so pages with fewer than ten
        // links no longer cause a modulo-by-zero.
        HashSet<String> seedUrls = new HashSet<String>();
        int samplingStep = Math.max(1, pageLinks.size() / 10);
        int position = 0;
        for (String link : pageLinks) {
            if (position % samplingStep == 0) {
                seedUrls.add(link);
                LOGGER.info("added1: " + link);
            }
            position++;
        }

        // Add the document URI and each URI obtained by cutting off the last path segment. Stops at
        // the "//" of the scheme, or when there is no slash left to cut at.
        String ancestorUrl = document.getDocumentURI();
        while (ancestorUrl != null) {
            seedUrls.add(ancestorUrl);
            LOGGER.info("added2: " + ancestorUrl);
            int lastSlash = ancestorUrl.lastIndexOf("/");
            if (lastSlash <= 0 || ancestorUrl.charAt(lastSlash - 1) == '/') {
                break;
            }
            ancestorUrl = ancestorUrl.substring(0, lastSlash);
        }

        HashSet<String> candidateUrls = new HashSet<String>();
        for (Document seedDocument : documentRetriever.getWebDocuments(seedUrls)) {
            candidateUrls.addAll(HtmlHelper.getLinks(seedDocument, true, false, ""));
        }
        candidateUrls.remove(document.getDocumentURI());

        String labelOfURL = PageSegmenterHelper.getLabelOfURL(document.getDocumentURI());
        LOGGER.info("label: " + labelOfURL);

        // Stable partition: candidates with a matching label keep their place at the front, all
        // others are appended behind them.
        boolean ownLabelNumeric = labelOfURL.matches("^[0-9]+$");
        List<String> orderedUrls = new ArrayList<String>();
        List<String> deferredUrls = new ArrayList<String>();
        for (String candidateUrl : candidateUrls) {
            String candidateLabel = PageSegmenterHelper.getLabelOfURL(candidateUrl);
            boolean labelMatches = labelOfURL.equals(candidateLabel)
                    || (ownLabelNumeric && candidateLabel.matches("^[0-9]+$"));
            if (labelMatches) {
                orderedUrls.add(candidateUrl);
            } else {
                deferredUrls.add(candidateUrl);
            }
        }
        orderedUrls.addAll(deferredUrls);

        Bag<String> ownFingerprint = createFingerprint(document, i, i2);
        // The reference text does not change while candidates are checked.
        String ownText = HtmlHelper.documentToReadableText(document);
        LinkedHashMap<Document, Double> similarDocuments = new LinkedHashMap<Document, Double>();
        DocumentRetriever batchRetriever = new DocumentRetriever();
        Iterator<String> remainingUrls = orderedUrls.iterator();
        while (remainingUrls.hasNext()) {
            // Download the candidates in batches of ten. The batch itself must be filled here;
            // previously the URLs ended up in the seed set and nothing was ever downloaded.
            HashSet<String> batch = new HashSet<String>();
            for (int n = 0; remainingUrls.hasNext() && n < 10; n++) {
                batch.add(remainingUrls.next());
            }
            for (Document candidate : batchRetriever.getWebDocuments(batch)) {
                if (ownText.equals(HtmlHelper.documentToReadableText(candidate))) {
                    // Same readable text as the reference document: nothing to learn from it.
                    LOGGER.info("#####################################################");
                    continue;
                }
                Bag<String> candidateFingerprint = createFingerprint(candidate, i, i2);
                double fingerprintDistance = SimilarityCalculator.calculateSimilarity(ownFingerprint, candidateFingerprint);
                double jaccard = SetSimilarities.JACCARD.getSimilarity(ownFingerprint.uniqueItems(), candidateFingerprint.uniqueItems());
                String percent = Double.toString((1.0d - fingerprintDistance) * 100.0d);
                percent = percent.substring(0, Math.min(5, percent.length()));
                double score = ((1.0d - fingerprintDistance) + jaccard) / 2.0d;
                if (score < d || score >= 1.0d) {
                    LOGGER.info("Unterschied zu groß. Seiten verwenden wahrscheinlich nicht dasselbe Template. (" + percent + "%, Jaccard=" + jaccard + ")");
                } else {
                    similarDocuments.put(candidate, Double.valueOf(score));
                    LOGGER.info("Seiten verwenden wahrscheinlich dasselbe Template. (" + percent + "%, Jaccard=" + jaccard + ")----------" + similarDocuments.size());
                }
                LOGGER.info("----------------------------------------------------------------------------------------");
                if (similarDocuments.size() >= i3) {
                    break;
                }
            }
            if (similarDocuments.size() >= i3) {
                LOGGER.info("---Erg.: " + similarDocuments);
                break;
            }
        }
        return new ArrayList<Document>(CollectionHelper.sortByValue(similarDocuments).keySet());
    }

    /**
     * Creates the segment objects for all XPaths that resolve to an element of the document.
     *
     * @param document the segmented document
     * @param map      XPath to similarity value; the segment's variability is {@code 1 - value}
     * @param list     XPaths that receive a variability of {@code 0.0}
     * @return segments for {@code list} first, followed by the segments for {@code map}
     */
    private List<Segment> generateListOfSegments(Document document, Map<String, Double> map, List<String> list) {
        List<Segment> result = new ArrayList<Segment>();

        for (String xPath : list) {
            Element element = (Element) XPathHelper.getXhtmlNode(document, xPath);
            if (element == null) {
                continue;
            }
            Integer level = Integer.valueOf(PageSegmenterHelper.getNodeLevel(element));
            result.add(new Segment(document, xPath, element, level, Double.valueOf(0.0d)));
        }

        for (Map.Entry<String, Double> similarity : map.entrySet()) {
            String xPath = similarity.getKey();
            Double value = similarity.getValue();
            Element element = (Element) XPathHelper.getXhtmlNode(document, xPath);
            if (element == null) {
                continue;
            }
            Integer level = Integer.valueOf(PageSegmenterHelper.getNodeLevel(element));
            result.add(new Segment(document, xPath, element, level, Double.valueOf(1.0d - value.doubleValue())));
        }
        return result;
    }

    /**
     * Returns the segments of the last segmentation run.
     *
     * @return the internal segment list; {@code null} if {@link #startPageSegmentation()} has not run
     */
    public List<Segment> getAllSegments() {
        return this.segments;
    }

    /**
     * Returns the XPaths of all segments of the last segmentation run, in segment order.
     *
     * @return a new list with one XPath per segment
     */
    public List<String> getAllXPaths() {
        List<String> xPaths = new ArrayList<String>();
        for (Segment segment : this.segments) {
            xPaths.add(segment.getXPath());
        }
        return xPaths;
    }

    /**
     * Returns the documents the page is compared with.
     *
     * @return the internal list of similar documents; {@code null} if none were set or searched yet
     */
    public List<Document> getSimilarFiles() {
        return this.similarFiles;
    }

    /**
     * Selects the segments whose variability lies within the given bounds.
     *
     * @param d  lower bound (inclusive)
     * @param d2 upper bound (inclusive)
     * @return a new list with the matching segments, in segment order
     */
    public List<Segment> getSpecificSegments(double d, double d2) {
        List<Segment> matching = new ArrayList<Segment>();
        for (Segment candidate : this.segments) {
            boolean aboveLowerBound = candidate.getVariability().doubleValue() >= d;
            if (aboveLowerBound && candidate.getVariability().doubleValue() <= d2) {
                matching.add(candidate);
            }
        }
        return matching;
    }

    /**
     * Selects the segments that have exactly the given color.
     *
     * @param color the color to look for
     * @return a new list with the matching segments, in segment order
     */
    public List<Segment> getSpecificSegments(Segment.Color color) {
        List<Segment> matching = new ArrayList<Segment>();
        for (Segment candidate : this.segments) {
            if (candidate.getColor() != color) {
                continue;
            }
            matching.add(candidate);
        }
        return matching;
    }

    /**
     * Reads the {@code pageSegmentation.*} settings from the global configuration into the static
     * fields of this class, falling back to the built-in defaults for missing keys.
     */
    public final void loadConfig() {
        Configuration configuration = ConfigHolder.getInstance().getConfig();
        lengthOfQGrams = configuration.getInt("pageSegmentation.lengthOfQGrams", DEFAULT_LENGTH_OF_Q_GRAMS);
        amountOfQGrams = configuration.getInt("pageSegmentation.amountOfQGrams", DEFAULT_AMOUNT_OF_Q_GRAMS);
        similarityNeed = configuration.getDouble("pageSegmentation.similarityNeed", DEFAULT_SIMILARITY_NEED);
        maxDepth = configuration.getInt("pageSegmentation.maxDepth", DEFAULT_MAX_DEPTH);
        numberOfSimilarDocuments = configuration.getInt("pageSegmentation.numberOfSimilarDocuments", DEFAULT_NUM_SIMILAR_DOCUMENTS);
    }

    /**
     * Repeatedly generalises the XPaths of the given segments into a mutual XPath and collects the
     * XPaths of all document nodes that the mutual XPath selects.
     *
     * @param list the segments whose XPaths are generalised
     * @param i    number of rounds; after each round the XPaths that were found are taken out of
     *             the set the next mutual XPath is built from
     * @return the XPaths of all nodes found, in the order they were encountered
     */
    public List<String> makeMutual(List<Segment> list, int i) {
        List<String> foundXPaths = new ArrayList<String>();
        HashSet<String> openXPaths = new HashSet<String>();
        for (Segment segment : list) {
            openXPaths.add(segment.getXPath());
        }

        for (int round = 0; round < i; round++) {
            String mutualXPath = PageAnalyzer.makeMutualXPath(openXPaths);
            LOGGER.info("mutual: " + mutualXPath);

            String lastStep = mutualXPath.substring(mutualXPath.lastIndexOf("/") + 1, mutualXPath.length());
            LOGGER.info(lastStep);
            // A path that ends on a table row is extended to the row's cells.
            String queryXPath = lastStep.equals("tr") ? mutualXPath + "/td" : mutualXPath;

            List nodes = XPathHelper.getXhtmlNodes(this.document, queryXPath);
            LOGGER.info("--------------\n" + queryXPath + "\nS.size: " + openXPaths.size() + "\n---------------");
            for (Object node : nodes) {
                String nodeXPath = PageAnalyzer.constructXPath((Node) node);
                LOGGER.info(nodeXPath);
                foundXPaths.add(nodeXPath);
                openXPaths.remove(nodeXPath);
            }
            LOGGER.info("S.size neu: " + openXPaths.size());
            LOGGER.info(openXPaths.toString());
        }
        return foundXPaths;
    }

    /**
     * Sets the document to segment.
     *
     * @param document the document to segment
     */
    public void setDocument(Document document) {
        this.document = document;
    }

    /**
     * Retrieves the document behind the given location with a new {@link DocumentRetriever} and
     * uses it as the document to segment.
     *
     * @param str location of the document, handed to {@code DocumentRetriever.getWebDocument}
     */
    public void setDocument(String str) {
        this.document = new DocumentRetriever().getWebDocument(str);
    }

    /**
     * Sets the documents the page is compared with. When this is non-null,
     * {@link #startPageSegmentation()} skips its own search for similar documents.
     *
     * @param list the documents to compare with
     */
    public void setSimilarFiles(List<Document> list) {
        this.similarFiles = list;
    }

    /**
     * Sets the path the colored document is written to by {@code colorSegments}.
     *
     * @param str the target path
     */
    public void setStoreLocation(String str) {
        this.storeLocation = str;
    }

    /**
     * Runs the segmentation of the current document and stores the result in the segment list.
     *
     * <p>If no similar documents were set, they are searched first. The body of the document is then
     * compared with the body of every similar document. A comparison is accepted when it is the
     * first one or when its conflict list exceeds the conflicts known so far by less than 50% of
     * their number; its conflicting and non-conflicting XPaths are then merged into the overall
     * lists. Otherwise the similar document is considered incompatible and removed from the list of
     * similar documents. Finally every non-conflicting XPath that is contained in a conflicting
     * XPath is dropped and the segments are generated.</p>
     */
    public void startPageSegmentation() {
        if (this.similarFiles == null) {
            LOGGER.info("Start findSimilarFiles------------------");
            this.similarFiles = findSimilarFiles(this.document, amountOfQGrams, lengthOfQGrams, similarityNeed, numberOfSimilarDocuments);
        }
        Node body = this.document.getElementsByTagName("body").item(0);
        List<String> conflictNodes = new ArrayList<String>();
        List<String> nonConflictNodes = new ArrayList<String>();

        int round = 0;
        // Iterate with an iterator so that removing an incompatible document does not skip the
        // document that follows it.
        for (Iterator<Document> similarDocuments = this.similarFiles.iterator(); similarDocuments.hasNext();) {
            Document similarDocument = similarDocuments.next();
            round++;
            LOGGER.info(round + ".Runde-----------------------------------------");

            // Both array slots hold the List<String> instances handed in below; the raw array type
            // keeps this independent of the element type declared by compareDocuments.
            @SuppressWarnings("rawtypes")
            List[] comparison = compareDocuments(PageSegmenterHelper.transformNodeToDocument(body), PageSegmenterHelper.transformNodeToDocument(similarDocument.getElementsByTagName("body").item(0)), new ArrayList<String>(), new ArrayList<String>(), maxDepth, "/html/body");
            @SuppressWarnings("unchecked")
            List<String> foundConflicts = comparison[0];
            @SuppressWarnings("unchecked")
            List<String> foundNonConflicts = comparison[1];

            int sizeDifference = foundConflicts.size() - conflictNodes.size();
            // Tolerance: 50 percent of the conflicts known so far.
            int tolerance = (conflictNodes.size() * 50) / 100;
            LOGGER.info(foundConflicts.size() + "-" + conflictNodes.size() + "=" + sizeDifference + " zu " + tolerance);

            if (sizeDifference < tolerance || conflictNodes.size() == 0) {
                for (String conflict : foundConflicts) {
                    if (!conflictNodes.contains(conflict)) {
                        conflictNodes.add(conflict);
                    }
                }
                for (String nonConflict : foundNonConflicts) {
                    if (!nonConflictNodes.contains(nonConflict)) {
                        nonConflictNodes.add(nonConflict);
                    }
                }
                LOGGER.info("Size conflictNodes: " + conflictNodes.size());
                LOGGER.info("Size nonConflictNodes: " + nonConflictNodes.size());
            } else {
                LOGGER.info("Zu viele neue Konflikte. Wahrscheinlich Inkompatibel.");
                similarDocuments.remove();
            }
        }

        // Drop every non-conflicting XPath that occurs inside a conflicting one. Checking each
        // candidate against all conflicts makes sure no entry is skipped after a removal.
        for (Iterator<String> candidates = nonConflictNodes.iterator(); candidates.hasNext();) {
            String candidate = candidates.next();
            for (String conflict : conflictNodes) {
                if (conflict.contains(candidate)) {
                    candidates.remove();
                    break;
                }
            }
        }

        this.segments = generateListOfSegments(this.document, SimilarityCalculator.calculateSimilarityForAllNodes(this.document, conflictNodes, this.similarFiles), nonConflictNodes);
        LOGGER.info("Size conflictNodes: " + conflictNodes.size());
        LOGGER.info("Size nonConflictNodes: " + nonConflictNodes.size());
    }

    /**
     * Demo entry point: logs the configured settings before and after a segmenter is created, then
     * segments a fixed forum page.
     *
     * @param strArr command line arguments; not used
     */
    public static void main(String[] strArr) throws SAXException, IOException, ParserConfigurationException, TransformerFactoryConfigurationError, TransformerException, PageContentExtractorException {
        LOGGER.info("test: " + lengthOfQGrams);
        // Creating the segmenter reloads the static settings from the configuration.
        PageSegmenter segmenter = new PageSegmenter();
        LOGGER.info("test: " + lengthOfQGrams + " " + amountOfQGrams + " " + similarityNeed + " " + maxDepth + " " + numberOfSimilarDocuments);
        segmenter.setDocument("http://forum.handycool.de/viewforum.php?id=20");
        segmenter.startPageSegmentation();
    }
}
