package uk.bl.wap.hadoop.profiler;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpParser;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import uk.bl.wa.hadoop.WritableArchiveRecord;
import uk.bl.wa.nanite.droid.DroidDetector;
import uk.gov.nationalarchives.droid.command.action.CommandExecutionException;

/* loaded from: input_file:uk/bl/wap/hadoop/profiler/FormatProfilerMapper.class */
/**
 * Mapper that emits one format-profile line per archived (ARC/WARC) record.
 *
 * <p>The output key is a tab-separated string built from the enabled parts, in this order:
 * <pre>
 *   "ext"  \tSERVER:"content-type"  \tDROID:"type"  \tTIKA:"type"
 * </pre>
 * The output value is the four-digit capture year when {@link #INCLUDE_WAYBACKYEAR} is enabled,
 * otherwise the empty string.
 *
 * <p>Which parts are produced is controlled by boolean switches that default to the values in
 * {@link #defaultProperties()} and may be overridden by a {@value #propertiesFile} resource on the
 * class path.
 */
public class FormatProfilerMapper extends MapReduceBase implements Mapper<Text, WritableArchiveRecord, Text, Text> {
    private static final Logger log = Logger.getLogger(FormatProfilerMapper.class.getName());
    private static final boolean droidUseBinarySignaturesOnly = false;
    /** Size (20 MiB) of the mark/reset buffer; also passed to DROID as its scan limit. */
    private static final int BUF_SIZE = 20971520;
    /** Content type reported when the record does not supply one. */
    private static final String UNKNOWN_SERVER_TYPE = "application/x-unknown";
    /** Captures the leading run of ASCII letters/digits; compiled once instead of per call. */
    private static final Pattern LEADING_ALNUM = Pattern.compile("^([a-zA-Z0-9]*).*$");
    /** Extensions that are collapsed to their canonical prefix (e.g. "htmltop" becomes "html"). */
    private static final String[] CANONICAL_EXTENSIONS = {"html", "jpg", "jpeg", "png"};

    static final String propertiesFile = "FormatProfiler.properties";
    static final String INCLUDE_EXTENSION = "INCLUDE_EXTENSION";
    static final String INCLUDE_SERVERTYPE = "INCLUDE_SERVERTYPE";
    static final String USE_DROID = "USE_DROID";
    static final String USE_TIKADETECT = "USE_TIKADETECT";
    static final String INCLUDE_WAYBACKYEAR = "INCLUDE_WAYBACKYEAR";

    /** Effective switch values: defaults, possibly overridden by {@link #loadConfig()}. */
    private final Map<String, Boolean> gProps = defaultProperties();
    private DroidDetector droidDetector = null;
    private Tika tikaDetect = null;

    /**
     * Builds the default switch values.
     *
     * @return a new mutable map holding the default for every recognised switch
     */
    private static Map<String, Boolean> defaultProperties() {
        Map<String, Boolean> defaults = new HashMap<String, Boolean>();
        defaults.put(INCLUDE_EXTENSION, Boolean.TRUE);
        defaults.put(INCLUDE_SERVERTYPE, Boolean.TRUE);
        defaults.put(USE_DROID, Boolean.TRUE);
        defaults.put(USE_TIKADETECT, Boolean.TRUE);
        defaults.put(INCLUDE_WAYBACKYEAR, Boolean.FALSE);
        return defaults;
    }

    /**
     * Tells whether a switch is currently on.
     *
     * @param key one of the switch-name constants of this class
     * @return {@code true} only if the switch is present and set to true
     */
    private boolean isEnabled(String key) {
        return Boolean.TRUE.equals(this.gProps.get(key));
    }

    /**
     * Overrides the default switches with any matching entries found in the
     * {@value #propertiesFile} class-path resource, then logs the effective values.
     * Unknown property names are ignored; a missing resource leaves the defaults untouched.
     */
    private void loadConfig() {
        InputStream in = FormatProfilerMapper.class.getClassLoader().getResourceAsStream(propertiesFile);
        if (in != null) {
            Properties properties = new Properties();
            try {
                properties.load(in);
                log.info("Loaded properties from " + propertiesFile);
            } catch (IOException e) {
                log.error("Failed to load properties from " + propertiesFile + "; defaults apply to unread entries", e);
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    log.warn("Failed to close " + propertiesFile, e);
                }
            }
            for (Map.Entry<String, Boolean> entry : this.gProps.entrySet()) {
                String configured = properties.getProperty(entry.getKey());
                if (configured != null) {
                    // Trailing whitespace is preserved by Properties.load and would turn "true " into false.
                    entry.setValue(Boolean.valueOf(configured.trim()));
                }
            }
        }
        for (Map.Entry<String, Boolean> entry : this.gProps.entrySet()) {
            log.info(entry.getKey() + ": " + entry.getValue());
        }
    }

    /**
     * Extracts a lower-case alphanumeric extension from a URL-like string.
     *
     * <p>When the string contains a dot, its URI path (if it parses and has one) is used, anything
     * after the first {@code ;} is discarded, and the text after the last dot is taken. The result
     * is the leading run of ASCII letters and digits of that text.
     *
     * @param str the URL or file name to inspect
     * @return the extension, or the empty string when the pattern does not match
     */
    private String getFileExt(String str) {
        String candidate = str.toLowerCase(Locale.ROOT);
        if (str.contains(".")) {
            try {
                // getPath() is null for opaque URIs such as "mailto:a@b.com".
                String path = new URI(str).getPath();
                if (path != null) {
                    candidate = path.toLowerCase(Locale.ROOT);
                }
            } catch (URISyntaxException ignored) {
                // Not a parseable URI: fall back to the raw lower-cased string.
            }
            int semicolon = candidate.indexOf(';');
            if (semicolon != -1) {
                candidate = candidate.substring(0, semicolon + 1);
            }
            candidate = candidate.substring(candidate.lastIndexOf('.') + 1);
        }
        Matcher matcher = LEADING_ALNUM.matcher(candidate);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Derives a normalised file extension from the last path segment of a URL.
     *
     * <p>The fragment and query string are removed before the last {@code /} is located, so a
     * slash inside a query value (e.g. {@code a.pdf?next=/home}) no longer hides the extension.
     * The extension is lower-cased, stripped of everything but {@code 0-9a-z}, and collapsed to
     * {@code html}, {@code jpg}, {@code jpeg} or {@code png} when it starts with one of those.
     *
     * @param str the URL to inspect
     * @return the extension, or {@code null} when the URL has no {@code /} or its last segment has no dot
     */
    private static String parseExtension(String str) {
        String path = str;
        int fragment = path.indexOf('#');
        if (fragment != -1) {
            path = path.substring(0, fragment);
        }
        int query = path.indexOf('?');
        if (query != -1) {
            path = path.substring(0, query);
        }
        int slash = path.lastIndexOf('/');
        if (slash == -1) {
            return null;
        }
        String file = path.substring(slash);
        int ampersand = file.indexOf('&');
        if (ampersand != -1) {
            file = file.substring(0, ampersand);
        }
        int dot = file.lastIndexOf('.');
        if (dot == -1) {
            return null;
        }
        String extension = file.substring(dot).toLowerCase(Locale.ROOT).replaceAll("[^0-9a-z]", "");
        for (String canonical : CANONICAL_EXTENSIONS) {
            if (extension.startsWith(canonical)) {
                return canonical;
            }
        }
        return extension;
    }

    /**
     * Reads the server-declared content type of a record and advances the record stream past
     * its HTTP header.
     *
     * <p>For WARC records the HTTP status line and headers are parsed from the stream and the
     * last {@code Content-Type} header wins. For ARC records the mimetype of the record header is
     * used and the HTTP header is skipped.
     *
     * @param writableArchiveRecord wrapper around the record to inspect
     * @return the lower-cased content type, or {@code application/x-unknown} when none is available
     * @throws RuntimeException if the record is neither a WARC nor an ARC record
     */
    private String getServerType(WritableArchiveRecord writableArchiveRecord) {
        String serverType = UNKNOWN_SERVER_TYPE;
        ArchiveRecord record = writableArchiveRecord.getRecord();
        if (record instanceof WARCRecord) {
            try {
                // Consume the status line so that parseHeaders starts at the first header.
                HttpParser.readLine(record, "UTF-8");
                Header[] headers = HttpParser.parseHeaders(record, "UTF-8");
                for (Header header : headers) {
                    if ("content-type".equalsIgnoreCase(header.getName())) {
                        String value = header.getValue();
                        if (value != null) {
                            serverType = value.toLowerCase(Locale.ROOT);
                        }
                    }
                }
            } catch (HttpException e) {
                // Must precede IOException: HttpException is a subclass of it.
                log.warn("HttpException while processing server type: " + e);
            } catch (IOException e) {
                log.warn("IOException while processing server type: " + e);
            }
        } else if (record instanceof ARCRecord) {
            ARCRecord arcRecord = (ARCRecord) record;
            ArchiveRecordHeader header = arcRecord.getHeader();
            if (header.getHeaderFields().isEmpty()) {
                log.warn("LOG: Empty header fields.");
            } else {
                String mimetype = header.getMimetype();
                if (mimetype == null) {
                    log.warn("LOG: Server Content-Type is null.");
                } else {
                    serverType = mimetype.toLowerCase(Locale.ROOT);
                }
            }
            try {
                arcRecord.skipHttpHeader();
            } catch (IOException e) {
                log.warn("IOException while processing server type: " + e);
            }
        } else {
            throw new RuntimeException("Unsupported record type!");
        }
        return serverType;
    }

    /**
     * Extracts the capture year from the record header date.
     *
     * @param writableArchiveRecord wrapper around the record to inspect
     * @return the first (up to) four digits of the header date, or {@code "unknown"} when the
     *         header has no fields or no date
     */
    private String getWaybackYear(WritableArchiveRecord writableArchiveRecord) {
        ArchiveRecordHeader header = writableArchiveRecord.getRecord().getHeader();
        if (header.getHeaderFields().isEmpty()) {
            log.warn("LOG: Empty header fields!");
            return "unknown";
        }
        String date = header.getDate();
        if (date == null) {
            log.warn("LOG: Record date is null.");
            return "unknown";
        }
        String digits = date.replaceAll("[^0-9]", "");
        return digits.substring(0, Math.min(digits.length(), 4));
    }

    /**
     * Decodes a Base64 byte array and reads one Java-serialized object from it.
     *
     * <p>NOTE(review): Java-native deserialization must only be applied to trusted input.
     *
     * @param bArr Base64-encoded serialized object
     * @return the deserialized object, or {@code null} if decoding or reading fails
     */
    private Object deserialize(byte[] bArr) {
        Object result = null;
        ObjectInputStream in = null;
        try {
            in = new ObjectInputStream(new ByteArrayInputStream(Base64.decodeBase64(bArr)));
            result = in.readObject();
        } catch (ClassNotFoundException e) {
            log.error("Class of serialized object not found", e);
        } catch (IOException e) {
            log.error("Failed to deserialize object", e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    log.warn("Failed to close object stream", e);
                }
            }
        }
        return result;
    }

    /**
     * Loads the switch configuration and creates the detectors that are enabled.
     * If the DROID detector cannot be created, the failure is logged and DROID detection is
     * switched off for this mapper instead of failing on every record.
     *
     * @param jobConf the job configuration (not consulted by this implementation)
     */
    @Override
    public void configure(JobConf jobConf) {
        loadConfig();
        if (isEnabled(USE_DROID)) {
            try {
                DroidDetector detector = new DroidDetector();
                detector.setBinarySignaturesOnly(droidUseBinarySignaturesOnly);
                this.droidDetector = detector;
            } catch (CommandExecutionException e) {
                log.error("droidDetector CommandExecutionException " + e + "; DROID detection disabled", e);
                this.gProps.put(USE_DROID, Boolean.FALSE);
            }
        }
        if (isEnabled(USE_TIKADETECT)) {
            this.tikaDetect = new Tika();
        }
    }

    /**
     * Delegates to the superclass; this mapper holds no resources of its own to release.
     *
     * @throws IOException if the superclass close fails
     */
    @Override
    public void close() throws IOException {
        super.close();
    }

    /**
     * Profiles one archive record and emits a single key/value pair describing it.
     *
     * <p>WARC records whose {@code WARC-Type} is not {@code response} are skipped. On failure a
     * marker record ({@code IOException}, {@code "Malformed Record"} or {@code Exception},
     * followed by the input key) is emitted instead of the profile.
     *
     * @param text the input key, echoed in failure records
     * @param writableArchiveRecord wrapper around the ARC/WARC record to profile
     * @param outputCollector receives the profile (or failure marker) and the capture year
     * @param reporter unused
     * @throws IOException declared by the {@code Mapper} contract
     */
    @Override
    public void map(Text text, WritableArchiveRecord writableArchiveRecord, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
        ArchiveRecord record = writableArchiveRecord.getRecord();
        if (record instanceof WARCRecord && !"response".equals(record.getHeader().getHeaderValue("WARC-Type"))) {
            return;
        }
        // Text cannot wrap a null String, so an empty value is emitted when the year is disabled.
        String waybackYear = isEnabled(INCLUDE_WAYBACKYEAR) ? getWaybackYear(writableArchiveRecord) : "";
        // Must run before detection: it also moves the stream past the HTTP header.
        String serverType = getServerType(writableArchiveRecord);
        log.debug("Server Type: " + serverType);
        String rawUrl = record.getHeader().getUrl();
        boolean hasUrl = rawUrl != null && rawUrl.length() > 0;
        BufferedInputStream content = null;
        try {
            String profile = "";
            if (isEnabled(INCLUDE_EXTENSION)) {
                String extension = "";
                if (hasUrl) {
                    // NOTE: parseExtension may return null, which is emitted as the literal "null";
                    // kept as-is because it is part of the existing output format.
                    extension = parseExtension(rawUrl);
                }
                profile = "\"" + extension + "\"";
            }
            if (isEnabled(INCLUDE_SERVERTYPE)) {
                profile = profile + "\tSERVER:\"" + serverType + "\"";
            }
            String resourceName = hasUrl ? URLEncoder.encode(rawUrl, "UTF-8") : rawUrl;
            // The shield keeps close() on the buffer from closing the underlying record stream.
            content = new BufferedInputStream(new CloseShieldInputStream(record), BUF_SIZE);
            content.mark(BUF_SIZE);
            if (isEnabled(USE_DROID)) {
                Metadata metadata = new Metadata();
                metadata.set("resourceName", resourceName);
                log.trace("Using DroidDetector...");
                this.droidDetector.setMaxBytesToScan(BUF_SIZE);
                profile = profile + "\tDROID:\"" + this.droidDetector.detect(content, metadata) + "\"";
                content.reset();
            }
            if (isEnabled(USE_TIKADETECT)) {
                Metadata metadata = new Metadata();
                metadata.set("resourceName", resourceName);
                log.trace("Using Tika detect...");
                profile = profile + "\tTIKA:\"" + this.tikaDetect.detect(content, metadata) + "\"";
                content.reset();
            }
            outputCollector.collect(new Text(profile), new Text(waybackYear));
            log.trace("OUTPUT " + profile + " " + waybackYear);
        } catch (IOException e) {
            log.error("Failed to identify due to IOException:" + e, e);
            emitFailure(outputCollector, "IOException\t\"" + text + "\"", waybackYear);
        } catch (NumberFormatException e) {
            log.error("Potentially malformed (W)ARC file, skipping URL: (" + rawUrl + ")");
            emitFailure(outputCollector, "\"Malformed Record\"\t\"" + text + "\"", waybackYear);
        } catch (Exception e) {
            log.error("Exception: " + e.getMessage() + " for record (" + rawUrl + ")", e);
            emitFailure(outputCollector, "Exception\t\"" + text + "\"", waybackYear);
        } finally {
            if (content != null) {
                try {
                    content.close();
                } catch (IOException e) {
                    log.warn("Failed to close record buffer: " + e);
                }
            }
        }
    }

    /**
     * Emits a failure-marker record, logging (rather than propagating) any collector error.
     *
     * @param outputCollector the collector to write to
     * @param marker the complete output key describing the failure
     * @param waybackYear the output value to pair with the marker
     */
    private void emitFailure(OutputCollector<Text, Text> outputCollector, String marker, String waybackYear) {
        try {
            outputCollector.collect(new Text(marker), new Text(waybackYear));
        } catch (IOException e) {
            log.error("Unable to emit failure record " + marker, e);
        }
    }
}
