package org.apache.uima.ruta.engine;

import java.util.HashSet;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.ruta.UIMAConstants;
import org.apache.uima.util.CasCopier;
import org.apache.uima.util.Level;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/* loaded from: input_file:ruta-core-2.4.0.jar:org/apache/uima/ruta/engine/HtmlConverter.class */
public class HtmlConverter extends JCasAnnotator_ImplBase {
    /** Namespace prefix constant; it is not referenced anywhere else in this class. */
    public static final String NAMESPACE = "org.apache.uima.ruta.type.html.";
    /** View name used for the output view when the {@code outputView} parameter is blank. */
    public static final String DEFAULT_MODIFIED_VIEW = "plaintext";
    /** Line feed constant. */
    public static final String LINEBREAK = "\n";
    /** Parameter name: the view that receives the converted plain text. */
    public static final String PARAM_OUTPUT_VIEW = "outputView";

    /** Name of the output view; falls back to {@link #DEFAULT_MODIFIED_VIEW} when blank. */
    @ConfigurationParameter(name = "outputView", mandatory = false, defaultValue = {DEFAULT_MODIFIED_VIEW})
    private String modifiedViewName;
    /** Parameter name: the view whose document text is converted. */
    public static final String PARAM_INPUT_VIEW = "inputView";

    /** Name of the input view; {@code null} means the JCas handed to {@code process} is used directly. */
    @ConfigurationParameter(name = "inputView", mandatory = false)
    private String inputViewName;
    /** Parameter name: whether linebreaks found in the document text are replaced. */
    public static final String PARAM_REPLACE_LINEBREAKS = "replaceLinebreaks";

    /** When true, linebreaks inside text spans are converted using {@link #linebreakReplacement}. */
    @ConfigurationParameter(name = PARAM_REPLACE_LINEBREAKS, mandatory = false, defaultValue = {"true"})
    private Boolean replaceLinebreaks;
    /** Parameter name: whitespace handling flag that is forwarded to the HTML visitor. */
    public static final String PARAM_SKIP_WHITESPACES = "skipWhitespaces";

    /** Passed unchanged to {@code HtmlConverterVisitor}; its exact effect is defined there. */
    @ConfigurationParameter(name = PARAM_SKIP_WHITESPACES, mandatory = false, defaultValue = {"true"})
    private Boolean skipWhitespaces;
    /** Parameter name: flag forwarded to the HTML visitor. */
    public static final String PARAM_PROCESS_ALL = "processAll";

    /**
     * Passed unchanged to {@code HtmlConverterVisitor}.
     * NOTE(review): the declared default is "false", but {@code initialize} falls back to
     * {@code true} when the context returns no value - confirm which default is intended.
     */
    @ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = false, defaultValue = {"false"})
    private Boolean processAll;
    /** Parameter name: whether annotations with an empty mapped span are expanded. */
    public static final String PARAM_EXPAND_OFFSETS = "expandOffsets";

    /**
     * When true, annotations whose mapped begin is not smaller than their mapped end are given
     * the offsets of a neighbouring annotation of the output view instead of being dropped.
     */
    @ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = false, defaultValue = {"false"})
    private Boolean expandOffsets;
    /** Parameter name: the text that replaces linebreaks of the document text. */
    public static final String PARAM_LINEBREAK_REPLACEMENT = "linebreakReplacement";

    /** Replacement for linebreaks; an empty value removes them without emitting any text. */
    @ConfigurationParameter(name = PARAM_LINEBREAK_REPLACEMENT, mandatory = false, defaultValue = {""})
    private String linebreakReplacement;
    /** Parameter name: tag names forwarded to the HTML visitor as newline-inducing tags. */
    public static final String PARAM_NEWLINE_INDUCING_TAGS = "newlineInducingTags";

    /** Tag names handed to {@code HtmlConverterVisitor}; presumably each one yields a linebreak span. */
    @ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAGS, mandatory = false, defaultValue = {"br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote"})
    private String[] newlineInducingTags;
    /** Parameter name: regular expression forwarded to the HTML visitor. */
    public static final String PARAM_NEWLINE_INDUCING_TAG_REGEXP = "newlineInducingTagRegExp";

    /** Optional expression handed to {@code HtmlConverterVisitor}; may be {@code null}. */
    @ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAG_REGEXP, mandatory = false)
    private String newlineInducingTagRegExp;
    /** Parameter name: tag names forwarded to the HTML visitor as gap-inducing tags. */
    public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags";

    /** Tag names handed to {@code HtmlConverterVisitor}; an empty array is used when unset. */
    @ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = false)
    private String[] gapInducingTags;
    /** Parameter name: the text forwarded to the HTML visitor as gap text. */
    public static final String PARAM_GAP_TEXT = "gapText";

    /** Gap text handed to {@code HtmlConverterVisitor}; forced to a single space by {@link #useSpaceGap}. */
    @ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = false, defaultValue = {""})
    private String gapText;
    /** Parameter name: shortcut that sets the gap text to a single space. */
    public static final String PARAM_USE_SPACE_GAP = "useSpaceGap";

    /**
     * When true, {@link #gapText} is overwritten with {@code " "} during initialization.
     * NOTE(review): the declared default is an empty string rather than a boolean literal;
     * {@code initialize} treats a missing value as false.
     */
    @ConfigurationParameter(name = PARAM_USE_SPACE_GAP, mandatory = false, defaultValue = {""})
    private Boolean useSpaceGap;
    /** Parameter name: the patterns that are replaced under the "explicit" conversion policy. */
    public static final String PARAM_CONVERSION_PATTERNS = "conversionPatterns";

    /** Patterns to replace; each entry is compiled as a regular expression by {@code handleConversion}. */
    @ConfigurationParameter(name = PARAM_CONVERSION_PATTERNS, mandatory = false, defaultValue = {"&nbsp;", "&laquo;", "&raquo;", "&quot;", "&amp;", "&lt;", "&gt;", "&apos;", "&sect;", "&uml;", "&copy;", "&trade;", "&reg;", "&ouml;", "&auml;", "&uuml;", "&#160;"})
    private String[] conversionPatterns;
    /** Parameter name: selects how HTML entities are converted. */
    public static final String PARAM_CONVERSION_POLICY = "conversionPolicy";

    /**
     * One of "heuristic" (decode everything that looks like an entity), "explicit" (apply
     * {@link #conversionPatterns} with {@link #conversionReplacements}) or "none".
     */
    @ConfigurationParameter(name = PARAM_CONVERSION_POLICY, mandatory = false, defaultValue = {"heuristic"})
    private String conversionPolicy;
    /** Parameter name: the replacement texts that belong to the conversion patterns. */
    public static final String PARAM_CONVERSION_REPLACEMENTS = "conversionReplacements";

    /**
     * Replacement for the pattern with the same index; when unset, {@code initialize} derives
     * the values by HTML-unescaping the patterns.
     */
    @ConfigurationParameter(name = PARAM_CONVERSION_REPLACEMENTS, mandatory = false)
    private String[] conversionReplacements;
    /** Offset map built by {@code process}: index is a source text offset, value the output text offset. */
    private int[] map;

    /**
     * Reads all configuration parameters from the context and applies the fallbacks for
     * parameters that are missing or blank.
     *
     * <p>Unlike before, the validated conversion policy and the configured newline-inducing
     * tags are stored in their fields, so {@code process} no longer depends on another
     * mechanism having populated them.
     *
     * @param uimaContext the context providing the parameter values
     * @throws ResourceInitializationException if the conversion policy is unknown, if input and
     *         output view names are equal, or if the "explicit" policy is configured with fewer
     *         replacements than patterns
     */
    @Override // org.apache.uima.fit.component.JCasAnnotator_ImplBase
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);

        this.inputViewName = (String) uimaContext.getConfigParameterValue(PARAM_INPUT_VIEW);
        this.inputViewName = StringUtils.isBlank(this.inputViewName) ? null : this.inputViewName;
        this.modifiedViewName = (String) uimaContext.getConfigParameterValue(PARAM_OUTPUT_VIEW);
        this.modifiedViewName = StringUtils.isBlank(this.modifiedViewName) ? DEFAULT_MODIFIED_VIEW : this.modifiedViewName;

        this.replaceLinebreaks = (Boolean) uimaContext.getConfigParameterValue(PARAM_REPLACE_LINEBREAKS);
        this.replaceLinebreaks = Boolean.valueOf(this.replaceLinebreaks == null ? true : this.replaceLinebreaks.booleanValue());
        this.skipWhitespaces = (Boolean) uimaContext.getConfigParameterValue(PARAM_SKIP_WHITESPACES);
        this.skipWhitespaces = Boolean.valueOf(this.skipWhitespaces == null ? true : this.skipWhitespaces.booleanValue());
        // NOTE(review): the annotation on the field declares "false" as default while this
        // fallback uses true; kept as is because existing descriptors may rely on it.
        this.processAll = (Boolean) uimaContext.getConfigParameterValue(PARAM_PROCESS_ALL);
        this.processAll = Boolean.valueOf(this.processAll == null ? true : this.processAll.booleanValue());
        this.linebreakReplacement = (String) uimaContext.getConfigParameterValue(PARAM_LINEBREAK_REPLACEMENT);
        this.linebreakReplacement = this.linebreakReplacement == null ? "" : this.linebreakReplacement;

        // A blank policy means "heuristic"; the validated value is stored so that process()
        // never dereferences an unset field.
        String policy = (String) uimaContext.getConfigParameterValue(PARAM_CONVERSION_POLICY);
        if (StringUtils.isBlank(policy)) {
            policy = "heuristic";
        }
        if (!policy.equals("heuristic") && !policy.equals("explicit") && !policy.equals("none")) {
            throw new ResourceInitializationException("illegal conversionPolicy parameter value", new Object[0]);
        }
        this.conversionPolicy = policy;

        this.newlineInducingTags = (String[]) uimaContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAGS);
        if (this.newlineInducingTags == null) {
            this.newlineInducingTags = new String[]{"br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote"};
        }

        if (this.modifiedViewName.equals(this.inputViewName)) {
            throw new ResourceInitializationException("input and output view names must differ!", new Object[0]);
        }

        this.conversionPatterns = (String[]) uimaContext.getConfigParameterValue(PARAM_CONVERSION_PATTERNS);
        if (this.conversionPatterns == null) {
            this.conversionPatterns = new String[]{"&nbsp;", "&laquo;", "&raquo;", "&quot;", "&amp;", "&lt;", "&gt;", "&apos;", "&sect;", "&uml;", "&copy;", "&trade;", "&reg;", "&ouml;", "&auml;", "&uuml;", "&#160;"};
        }
        this.conversionReplacements = (String[]) uimaContext.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
        if (this.conversionReplacements == null) {
            // Without explicit replacements every pattern is replaced by its unescaped form.
            this.conversionReplacements = new String[this.conversionPatterns.length];
            for (int i = 0; i < this.conversionPatterns.length; i++) {
                this.conversionReplacements[i] = StringEscapeUtils.unescapeHtml4(this.conversionPatterns[i]);
            }
        } else if (policy.equals("explicit") && this.conversionReplacements.length < this.conversionPatterns.length) {
            // process() pairs patterns and replacements by index; fail here instead of with an
            // ArrayIndexOutOfBoundsException on the first document.
            throw new ResourceInitializationException("conversionReplacements must provide one replacement per conversionPatterns entry", new Object[0]);
        }

        this.gapText = (String) uimaContext.getConfigParameterValue(PARAM_GAP_TEXT);
        this.gapText = this.gapText == null ? "" : this.gapText;
        this.useSpaceGap = (Boolean) uimaContext.getConfigParameterValue(PARAM_USE_SPACE_GAP);
        this.useSpaceGap = Boolean.valueOf(this.useSpaceGap == null ? false : this.useSpaceGap.booleanValue());
        if (this.useSpaceGap.booleanValue()) {
            // The shortcut overrides whatever gap text was configured.
            this.gapText = " ";
        }
        this.gapInducingTags = (String[]) uimaContext.getConfigParameterValue(PARAM_GAP_INDUCING_TAGS);
        this.gapInducingTags = this.gapInducingTags == null ? new String[0] : this.gapInducingTags;
        this.expandOffsets = (Boolean) uimaContext.getConfigParameterValue(PARAM_EXPAND_OFFSETS);
        this.expandOffsets = Boolean.valueOf(this.expandOffsets == null ? false : this.expandOffsets.booleanValue());
        this.newlineInducingTagRegExp = (String) uimaContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAG_REGEXP);
    }

    /**
     * Converts the HTML document text of the input view into plain text, stores it in the
     * output view and copies the annotations of the input view to their mapped offsets.
     *
     * <p>Failures while accessing or creating views and while mapping annotations are now
     * reported as {@link AnalysisEngineProcessException} instead of being printed and ignored.
     *
     * @param jCas the CAS to process; the configured input view is read from it
     * @throws AnalysisEngineProcessException if the HTML cannot be parsed or a view operation fails
     */
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        try {
            JCas sourceView = this.inputViewName != null ? jCas.getView(this.inputViewName) : jCas;
            String documentText = sourceView.getDocumentText();
            String sourceLinebreak = documentText.contains(IOUtils.LINE_SEPARATOR_WINDOWS) ? IOUtils.LINE_SEPARATOR_WINDOWS : "\n";
            // One extra slot so that the end offset of the document can be mapped as well.
            this.map = new int[documentText.length() + 1];

            // Reuse an already existing output view, otherwise create it.
            JCas targetView = null;
            Iterator<?> views = sourceView.getViewIterator();
            while (views.hasNext()) {
                JCas candidate = (JCas) views.next();
                if (candidate.getViewName().equals(this.modifiedViewName)) {
                    targetView = candidate;
                    getContext().getLogger().log(Level.WARNING, "view with name \"" + this.modifiedViewName + "\" already exists.");
                }
            }
            if (targetView == null) {
                targetView = sourceView.createView(this.modifiedViewName);
            }

            NodeList nodes = new Parser(documentText).parse(null);
            HtmlConverterVisitor visitor = new HtmlConverterVisitor(this.newlineInducingTags, this.newlineInducingTagRegExp, this.gapInducingTags, this.gapText, this.skipWhitespaces.booleanValue(), this.processAll.booleanValue());
            nodes.visitAllNodesWith(visitor);
            SortedSet<HtmlConverterPSpan> textSpans = visitor.getTextSpans();
            SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
            SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = visitor.getGapsFromHtmlTags();

            if (this.replaceLinebreaks.booleanValue()) {
                textSpans = handleLinebreaksInDocumentText(textSpans, sourceLinebreak);
            }
            // A missing or blank policy is treated as the declared default "heuristic".
            String policy = StringUtils.isBlank(this.conversionPolicy) ? "heuristic" : this.conversionPolicy;
            if (policy.equals("heuristic")) {
                textSpans = htmlDecoding(textSpans);
            } else if (policy.equals("explicit")) {
                for (int i = 0; i < this.conversionPatterns.length; i++) {
                    textSpans = handleConversion(textSpans, this.conversionPatterns[i], this.conversionReplacements[i]);
                }
            }
            textSpans.addAll(linebreaksFromHtmlTags);
            textSpans.addAll(gapsFromHtmlTags);

            targetView.setDocumentText(buildPlainText(documentText, textSpans));
            mapAnnotations(sourceView, this.map, this.modifiedViewName);
        } catch (ParserException e) {
            throw new AnalysisEngineProcessException(e);
        } catch (CASException e) {
            // Wrap the exception itself; its cause may be null and would hide the failure.
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Concatenates the text of all spans and fills {@link #map} so that every source offset
     * points to the corresponding offset of the produced text.
     *
     * @param documentText the source text the span offsets refer to
     * @param spans text and replacement spans in document order
     * @return the plain text that results from the spans
     */
    private String buildPlainText(String documentText, SortedSet<HtmlConverterPSpan> spans) {
        StringBuilder plainText = new StringBuilder(documentText.length());
        int sourceOffset = 0;
        int targetOffset = 0;
        for (HtmlConverterPSpan span : spans) {
            int begin = span.getBegin();
            int end = span.getEnd();
            // Source characters that are skipped all collapse onto the current output position.
            while (sourceOffset < begin) {
                this.map[sourceOffset++] = targetOffset;
            }
            String emitted;
            if (span instanceof HtmlConverterPSpanReplacement) {
                emitted = span.getTxt();
                // Advance both counters in lockstep, but never beyond the end of the source
                // span: a replacement may be longer than the text it replaces.
                int lockstepEnd = Math.min(begin + emitted.length(), end);
                while (sourceOffset < lockstepEnd) {
                    this.map[sourceOffset++] = targetOffset++;
                }
                while (sourceOffset < end) {
                    this.map[sourceOffset++] = targetOffset;
                }
            } else {
                emitted = documentText.substring(begin, end);
                while (sourceOffset < end) {
                    this.map[sourceOffset++] = targetOffset++;
                }
            }
            plainText.append(emitted);
            // Keep the output counter identical to the length of the text produced so far.
            targetOffset = plainText.length();
        }
        while (sourceOffset < documentText.length()) {
            this.map[sourceOffset++] = targetOffset;
        }
        // The end of the source text corresponds to the end of the produced text; pointing one
        // past it made every annotation ending at the document end fail the range check.
        this.map[documentText.length()] = targetOffset;
        return plainText.toString();
    }

    /**
     * Copies the annotations of the given view into the named target view and moves them to
     * the offsets given by the offset map. Annotations of the document annotation type are
     * skipped.
     *
     * <p>Annotations with a non-empty mapped span are indexed if the span lies within the
     * target document. Annotations with an empty mapped span are, if offset expansion is
     * enabled, given the offsets of a neighbouring annotation of the target view.
     *
     * @param jCas the source view whose annotations are copied
     * @param iArr offset map: index is a source offset, value the target offset
     * @param str name of the target view
     * @throws CASException if the target view cannot be accessed
     */
    private void mapAnnotations(JCas jCas, int[] iArr, String str) throws CASException {
        JCas targetView = jCas.getView(str);
        HashSet<Annotation> emptyAfterMapping = new HashSet<Annotation>();
        TypeSystem sourceTypeSystem = jCas.getTypeSystem();
        Type documentType = sourceTypeSystem.getType(UIMAConstants.TYPE_DOCUMENT);
        CasCopier casCopier = new CasCopier(jCas.getCas(), targetView.getCas());
        // Both values are the same for every annotation, so look them up once.
        Feature sofaFeature = targetView.getTypeSystem().getFeatureByFullName("uima.tcas.Annotation:sofa");
        int targetEnd = targetView.getCas().getDocumentAnnotation().getEnd();

        for (Annotation sourceAnnotation : jCas.getAnnotationIndex()) {
            if (sourceTypeSystem.subsumes(documentType, sourceAnnotation.getType())) {
                continue;
            }
            Annotation copy = (Annotation) casCopier.copyFs(sourceAnnotation);
            copy.setFeatureValue(sofaFeature, targetView.getSofa());
            int mappedBegin = iArr[copy.getBegin()];
            int mappedEnd = iArr[copy.getEnd()];
            if (mappedBegin < mappedEnd) {
                // Mapped offsets refer to the target text, so they are validated against the
                // target document only; the output may be longer than the source text.
                if (mappedBegin >= targetEnd || mappedEnd > targetEnd || mappedBegin < 0 || mappedEnd <= 0) {
                    getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping");
                } else {
                    copy.setBegin(mappedBegin);
                    copy.setEnd(mappedEnd);
                    targetView.addFsToIndexes(copy);
                }
            } else if (this.expandOffsets.booleanValue()) {
                copy.setBegin(mappedBegin);
                copy.setEnd(mappedEnd);
                emptyAfterMapping.add(copy);
            }
        }

        // Second pass: the regular annotations are indexed now and can serve as neighbours.
        for (Annotation collapsed : emptyAfterMapping) {
            Annotation neighbour = getNextBestAnnotation(collapsed, targetView);
            if (neighbour != null) {
                collapsed.setBegin(neighbour.getBegin());
                collapsed.setEnd(neighbour.getEnd());
                // Mark the annotation if its type offers a feature for that purpose.
                Feature expandedFlag = collapsed.getType().getFeatureByBaseName("expandedOffsets");
                if (expandedFlag != null) {
                    collapsed.setBooleanValue(expandedFlag, true);
                }
                targetView.addFsToIndexes(collapsed);
            }
        }
    }

    /**
     * Finds an annotation of the given view that can lend its offsets to an annotation whose
     * mapped span is empty.
     *
     * <p>The annotation index is first positioned at the given annotation. If that yields
     * nothing, a probe of length one starting at the same begin is used. If that yields nothing
     * either, the first or the last annotation of the index is taken, depending on whether the
     * begin lies in the first or the second half of the document text.
     *
     * @param annotation the annotation that needs new offsets
     * @param jCas the view whose annotation index is searched
     * @return the annotation found, or {@code null} if the index has no suitable element
     */
    private Annotation getNextBestAnnotation(Annotation annotation, JCas jCas) {
        FSIterator<Annotation> positioned = jCas.getAnnotationIndex().iterator(annotation);
        if (positioned.isValid()) {
            return positioned.get();
        }
        // The probe is only used for positioning and is never added to the indexes.
        Annotation probe = new Annotation(jCas, annotation.getBegin(), annotation.getBegin() + 1);
        FSIterator<Annotation> fallback = jCas.getAnnotationIndex().iterator(probe);
        if (!fallback.isValid()) {
            if (jCas.getDocumentText().length() / 2 > annotation.getBegin()) {
                fallback.moveToFirst();
            } else {
                fallback.moveToLast();
            }
        }
        // A match of the probe itself was previously discarded, which returned null although a
        // usable neighbour had been found.
        return fallback.isValid() ? fallback.get() : null;
    }

    /**
     * Replaces the linebreaks of the document text within the given spans by the configured
     * linebreak replacement.
     *
     * @param sortedSet the spans to convert
     * @param str the linebreak sequence used by the document text
     * @return the converted spans
     */
    private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText(SortedSet<HtmlConverterPSpan> sortedSet, String str) {
        final String replacement = this.linebreakReplacement;
        final SortedSet<HtmlConverterPSpan> converted = handleConversion(sortedSet, str, replacement);
        return converted;
    }

    /**
     * Decodes everything in the given spans that looks like an HTML entity. A span containing
     * entities is split into replacement spans for the entities and plain spans for the text
     * in between; spans without entities are kept unchanged.
     *
     * @param sortedSet the spans to decode; the set itself is not modified
     * @return a new sorted set holding the decoded spans
     */
    private SortedSet<HtmlConverterPSpan> htmlDecoding(SortedSet<HtmlConverterPSpan> sortedSet) {
        TreeSet<HtmlConverterPSpan> decoded = new TreeSet<HtmlConverterPSpan>(sortedSet);
        Pattern entityPattern = Pattern.compile("(&[a-zA-Z0-9]{2,6};)|(&#\\d{2,5};)");
        for (HtmlConverterPSpan span : sortedSet) {
            String text = span.getTxt();
            Matcher entityMatcher = entityPattern.matcher(text);
            boolean found = entityMatcher.find();
            if (!found) {
                continue;
            }
            // The span is replaced by its pieces.
            decoded.remove(span);
            int spanBegin = span.getBegin();
            int spanEnd = span.getEnd();
            // Document offset up to which the span has been consumed.
            int cursor = spanBegin;
            while (found) {
                String entity = entityMatcher.group();
                int entityLength = entity.length();
                int matchBegin = spanBegin + entityMatcher.start();
                int matchEnd = spanBegin + entityMatcher.end();
                decoded.add(new HtmlConverterPSpanReplacement(matchBegin, matchEnd, StringEscapeUtils.unescapeHtml4(entity)));
                if (matchEnd > cursor + entityLength) {
                    // Plain text between the previous entity and this one.
                    decoded.add(new HtmlConverterPSpan(cursor, matchBegin, text.substring(cursor - spanBegin, matchBegin - spanBegin)));
                    cursor = matchBegin;
                }
                cursor += entityLength;
                found = entityMatcher.find();
            }
            if (cursor < spanEnd) {
                // Remaining text after the last entity.
                decoded.add(new HtmlConverterPSpan(cursor, spanEnd, text.substring(cursor - spanBegin, spanEnd - spanBegin)));
            }
        }
        return decoded;
    }

    /**
     * Replaces every match of a pattern within the given spans. A span containing matches is
     * split into replacement spans for the matches and plain spans for the text in between;
     * spans without a match are kept unchanged.
     *
     * <p>The consumed length is taken from the actual match rather than from the length of the
     * pattern string, so patterns whose matches differ in length from their source (character
     * classes, quantifiers, escapes) produce correct spans as well.
     *
     * @param sortedSet the spans to convert; the set itself is not modified
     * @param str the pattern to search for, compiled as a regular expression
     * @param str2 the replacement text; if empty, matches are removed without a replacement span
     * @return a new sorted set holding the converted spans
     */
    private SortedSet<HtmlConverterPSpan> handleConversion(SortedSet<HtmlConverterPSpan> sortedSet, String str, String str2) {
        TreeSet<HtmlConverterPSpan> converted = new TreeSet<HtmlConverterPSpan>(sortedSet);
        Pattern pattern = Pattern.compile(str);
        boolean emitReplacement = !StringUtils.isEmpty(str2);
        for (HtmlConverterPSpan span : sortedSet) {
            String text = span.getTxt();
            Matcher matcher = pattern.matcher(text);
            if (!matcher.find()) {
                continue;
            }
            // The span is replaced by its pieces.
            converted.remove(span);
            int spanBegin = span.getBegin();
            int spanEnd = span.getEnd();
            // Document offset up to which the span has been consumed.
            int cursor = spanBegin;
            do {
                int matchBegin = spanBegin + matcher.start();
                int matchEnd = spanBegin + matcher.end();
                if (emitReplacement) {
                    converted.add(new HtmlConverterPSpanReplacement(matchBegin, matchEnd, str2));
                }
                if (matchBegin > cursor) {
                    // Plain text between the previous match and this one.
                    converted.add(new HtmlConverterPSpan(cursor, matchBegin, text.substring(cursor - spanBegin, matchBegin - spanBegin)));
                    cursor = matchBegin;
                }
                // Skip exactly what was matched.
                cursor += matchEnd - matchBegin;
            } while (matcher.find());
            if (cursor < spanEnd) {
                // Remaining text after the last match.
                converted.add(new HtmlConverterPSpan(cursor, spanEnd, text.substring(cursor - spanBegin, spanEnd - spanBegin)));
            }
        }
        return converted;
    }
}
