package fr.pilato.elasticsearch.crawler.fs.tika;

import fr.pilato.elasticsearch.crawler.fs.settings.Fs;
import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.class */
/**
 * Holds the lazily created Tika {@link Parser}, {@link ParseContext} and
 * {@link LanguageDetector} used for text extraction.
 *
 * <p>All state lives in static fields and no synchronization is performed here;
 * callers that share this class between threads must coordinate access themselves.
 */
public class TikaInstance {
    private static final Logger logger = LogManager.getLogger(TikaInstance.class);
    private static Parser parser;
    private static ParseContext context;
    private static LanguageDetector detector;

    /**
     * Drops the cached parser and parse context so that the next call to
     * {@link #extractText(FsSettings, int, InputStream, Metadata)} rebuilds them from
     * the settings it receives. The cached language detector is left untouched.
     */
    public static void reloadTika() {
        parser = null;
        context = null;
    }

    /**
     * Ensures both the parser and the parse context exist.
     * The parser is built first because the context registers it.
     *
     * @param fs file system settings driving the OCR configuration
     */
    private static void initTika(Fs fs) {
        initParser(fs);
        initContext(fs);
    }

    /**
     * Builds the cached parser if it does not exist yet.
     *
     * <p>When PDF OCR is enabled and the {@code tesseract} check succeeds, the PDF parser
     * is switched to the {@code "ocr_and_text"} strategy. When PDF OCR is disabled, the
     * default parser is created with {@link TesseractOCRParser} in its exclusion list.
     *
     * @param fs file system settings; only {@code isPdfOcr()} is read here
     */
    private static void initParser(Fs fs) {
        if (parser != null) {
            return;
        }

        // Must be typed as PDFParser: setOcrStrategy is not part of the Parser interface.
        PDFParser pdfParser = new PDFParser();
        DefaultParser defaultParser;
        if (fs.isPdfOcr()) {
            logger.debug("OCR is activated for PDF documents");
            if (ExternalParser.check("tesseract")) {
                pdfParser.setOcrStrategy("ocr_and_text");
            } else {
                logger.debug("But Tesseract is not installed so we won't run OCR.");
            }
            defaultParser = new DefaultParser();
        } else {
            logger.debug("OCR is disabled. Even though it's detected, it must be disabled explicitly");
            defaultParser = new DefaultParser(
                    MediaTypeRegistry.getDefaultRegistry(),
                    new ServiceLoader(),
                    Collections.singletonList(TesseractOCRParser.class));
        }
        parser = new AutoDetectParser(defaultParser, pdfParser);
    }

    /**
     * Builds the cached parse context if it does not exist yet.
     *
     * <p>The context always registers the current parser under {@code Parser.class}. When
     * PDF OCR is enabled, a {@link TesseractOCRConfig} populated from {@code fs.getOcr()}
     * is registered as well; path, data path and output type are only applied when they
     * are non-null, while the language is always applied.
     *
     * @param fs file system settings providing the OCR options
     */
    private static void initContext(Fs fs) {
        if (context != null) {
            return;
        }

        // Configure a local instance and publish it only once complete, so that a failure
        // while reading the OCR settings does not leave a half-configured context cached.
        ParseContext newContext = new ParseContext();
        newContext.set(Parser.class, parser);
        if (fs.isPdfOcr()) {
            logger.debug("OCR is activated");
            TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
            if (fs.getOcr().getPath() != null) {
                ocrConfig.setTesseractPath(fs.getOcr().getPath());
            }
            if (fs.getOcr().getDataPath() != null) {
                ocrConfig.setTessdataPath(fs.getOcr().getDataPath());
            }
            ocrConfig.setLanguage(fs.getOcr().getLanguage());
            if (fs.getOcr().getOutputType() != null) {
                ocrConfig.setOutputType(fs.getOcr().getOutputType());
            }
            newContext.set(TesseractOCRConfig.class, ocrConfig);
        }
        context = newContext;
    }

    /**
     * Parses the given stream with the cached parser and returns the extracted body text.
     *
     * <p>The stream is always closed before this method returns or throws. A
     * {@link SAXException} that the content handler reports as "write limit reached" is
     * treated as a normal end of extraction and the text collected so far is returned.
     *
     * @param fsSettings settings whose {@code getFs()} section configures the parser
     * @param i value handed to {@link WriteOutContentHandler} as its constructor argument
     *          (the write limit)
     * @param inputStream content to parse; closed by this method
     * @param metadata metadata object passed through to the parser
     * @return the text accumulated by the content handler
     * @throws IOException if reading or closing the stream fails
     * @throws TikaException if parsing fails, including a SAX failure that is not caused
     *                       by the write limit
     */
    public static String extractText(FsSettings fsSettings, int i, InputStream inputStream, Metadata metadata) throws IOException, TikaException {
        initTika(fsSettings.getFs());
        WriteOutContentHandler handler = new WriteOutContentHandler(i);
        // try-with-resources closes the stream exactly once and keeps the primary exception
        // when close() fails as well (the close failure is attached as suppressed).
        try (InputStream in = inputStream) {
            try {
                parser.parse(in, new BodyContentHandler(handler), metadata, context);
            } catch (SAXException e) {
                if (!handler.isWriteLimitReached(e)) {
                    throw new TikaException("Unexpected SAX processing failure", e);
                }
                // Write limit reached: keep whatever text was collected so far.
            }
        }
        return handler.toString();
    }

    /**
     * Returns the cached language detector, creating it on first use.
     *
     * <p>If loading the models fails with an {@link IOException}, a warning is logged and
     * the detector instance obtained before the load attempt stays cached and is returned;
     * the load is not retried on later calls.
     *
     * @return the cached language detector
     */
    public static LanguageDetector langDetector() {
        if (detector == null) {
            try {
                detector = OptimaizeLangDetector.getDefaultLanguageDetector();
                detector.loadModels();
            } catch (IOException e) {
                logger.warn("Can not load lang detector models", e);
            }
        }
        return detector;
    }
}
