package com.yahoo.vespa.hosted.provision.maintenance;

import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
import com.yahoo.log.LogLevel;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.vespa.orchestrator.status.HostStatus;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import com.yahoo.yolean.Exceptions;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.TemporalAmount;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

/* loaded from: input_file:com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.class */
public class NodeFailer extends Maintainer {
    private static final Logger log = Logger.getLogger(NodeFailer.class.getName());
    // Slack used when judging ready nodes: subtracted (together with downTimeLimit) from "now" to get the oldest
    // acceptable config request time, and doubled to form the grace period after construction.
    // NOTE(review): presumably the interval at which nodes are expected to request config - confirm.
    private static final Duration nodeRequestInterval = Duration.ofMinutes(10);
    // Metric name: number of node failures held back by throttling in a maintenance run.
    static final String throttledNodeFailuresMetric = "throttledNodeFailures";
    // Metric name: 1 if at least one failure was throttled in a maintenance run, otherwise 0.
    static final String throttlingActiveMetric = "nodeFailThrottling";
    // Source of the time of the last request seen from a given hostname.
    private final HostLivenessTracker hostLivenessTracker;
    // Supplies the service model snapshot used to decide whether active nodes are up or down.
    private final ServiceMonitor serviceMonitor;
    // Used to prepare a redeployment of the owning application before an active node is failed.
    private final Deployer deployer;
    // How long an active node may have been recorded as down before it is reported for failing.
    private final Duration downTimeLimit;
    private final Clock clock;
    // Queried for application and host suspension status (ALLOWED_TO_BE_DOWN).
    private final Orchestrator orchestrator;
    // clock.instant() at construction; used to skip the config request check for ready nodes shortly after startup.
    private final Instant constructionTime;
    private final ThrottlePolicy throttlePolicy;
    private final Metric metric;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: com.yahoo.vespa.hosted.provision.maintenance.NodeFailer$1, reason: invalid class name */
    /* loaded from: input_file:com/yahoo/vespa/hosted/provision/maintenance/NodeFailer$1.class */
    /**
     * Compiler-generated lookup table backing the {@code switch} over {@link NodeType} in
     * {@code failAllowedFor}. It maps each handled constant's ordinal to the case label used there:
     * tenant = 1, host = 2, proxy = 3, proxyhost = 4.
     */
    public static /* synthetic */ class AnonymousClass1 {
        // Indexed by NodeType ordinal. Slots never assigned below stay 0, which matches no case label
        // and therefore ends up in the switch's default branch.
        static final /* synthetic */ int[] $SwitchMap$com$yahoo$config$provision$NodeType = new int[NodeType.values().length];

        static {
            // Every constant is looked up inside its own guard: if a constant cannot be resolved at runtime
            // (NoSuchFieldError), only that slot is left unmapped and the remaining entries are still filled in.
            try {
                $SwitchMap$com$yahoo$config$provision$NodeType[NodeType.tenant.ordinal()] = 1;
            } catch (NoSuchFieldError e) {
            }
            try {
                $SwitchMap$com$yahoo$config$provision$NodeType[NodeType.host.ordinal()] = 2;
            } catch (NoSuchFieldError e2) {
            }
            try {
                $SwitchMap$com$yahoo$config$provision$NodeType[NodeType.proxy.ordinal()] = 3;
            } catch (NoSuchFieldError e3) {
            }
            try {
                $SwitchMap$com$yahoo$config$provision$NodeType[NodeType.proxyhost.ordinal()] = 4;
            } catch (NoSuchFieldError e4) {
            }
        }
    }

    /* loaded from: input_file:com/yahoo/vespa/hosted/provision/maintenance/NodeFailer$ThrottlePolicy.class */
    /**
     * Limits on how many nodes may be failed within a time window. A policy combines a fraction of the
     * total node count with an absolute minimum; the larger of the two is the number allowed to fail.
     */
    public enum ThrottlePolicy {

        /** At most 2% of the nodes, but never fewer than 2, over a period of one day. */
        hosted(Duration.ofDays(1), 0.02, 2),

        /** Zero window and zero limits; {@code throttle} checks for this constant and never throttles. */
        disabled(Duration.ZERO, 0.0, 0);

        private final Duration throttleWindow;
        private final double fractionAllowedToFail;
        private final int minimumAllowedToFail;

        ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
            this.throttleWindow = throttleWindow;
            this.fractionAllowedToFail = fractionAllowedToFail;
            this.minimumAllowedToFail = minimumAllowedToFail;
        }

        /**
         * Returns the number of nodes allowed to fail out of the given total.
         *
         * @param totalNodes the total number of nodes the fraction is applied to
         * @return the larger of the fraction of {@code totalNodes} and the absolute minimum, truncated to an int
         */
        public int allowedToFailOf(int totalNodes) {
            double allowedByFraction = totalNodes * fractionAllowedToFail;
            return (int) Math.max(allowedByFraction, minimumAllowedToFail);
        }

        /**
         * Describes this policy as text, with the fraction resolved against the given total.
         *
         * @param totalNodes the total number of nodes the fraction is applied to
         * @return a one-line description of the limits and the throttle window
         */
        public String toHumanReadableString(int totalNodes) {
            return String.format("Max %.0f%% (%d) or %d nodes can fail over a period of %s",
                                 fractionAllowedToFail * 100,
                                 allowedToFailOf(totalNodes),
                                 minimumAllowedToFail,
                                 throttleWindow);
        }
    }

    /**
     * Creates a node failer.
     *
     * @param deployer used to redeploy the owning application when an active node is failed
     * @param hostLivenessTracker provides the time of the last request seen from each host
     * @param serviceMonitor provides the service status used to decide whether active nodes are down
     * @param nodeRepository the node repository this maintainer works on
     * @param duration the down time limit: how long an active node may be down before it is failed.
     *                 The smaller of half this value and 5 minutes is passed on to {@code Maintainer}
     *                 (presumably as the maintenance interval - confirm against Maintainer)
     * @param clock the time source; also read once here to record the construction time
     * @param orchestrator queried for suspension status of applications and hosts
     * @param throttlePolicy limits how many nodes may be failed in a time window
     * @param metric receives the throttling metrics
     */
    public NodeFailer(Deployer deployer,
                      HostLivenessTracker hostLivenessTracker,
                      ServiceMonitor serviceMonitor,
                      NodeRepository nodeRepository,
                      Duration duration,
                      Clock clock,
                      Orchestrator orchestrator,
                      ThrottlePolicy throttlePolicy,
                      Metric metric) {
        super(nodeRepository, min(duration.dividedBy(2), Duration.ofMinutes(5)));

        // Collaborators
        this.deployer = deployer;
        this.hostLivenessTracker = hostLivenessTracker;
        this.serviceMonitor = serviceMonitor;
        this.orchestrator = orchestrator;
        this.metric = metric;

        // Policy and timing
        this.downTimeLimit = duration;
        this.throttlePolicy = throttlePolicy;
        this.clock = clock;
        this.constructionTime = clock.instant();
    }

    /**
     * Runs one maintenance pass:
     * <ol>
     *   <li>Under the allocation lock: records request events for ready nodes and fails the ready nodes
     *       that have a failure reason, unless throttled.</li>
     *   <li>With the allocation lock released: updates the down state of active nodes and fails the
     *       active nodes that have a failure reason, if failing is allowed for their type and not throttled.</li>
     *   <li>Reports how many failures were throttled.</li>
     * </ol>
     */
    @Override // com.yahoo.vespa.hosted.provision.maintenance.Maintainer
    protected void maintain() {
        int throttledNodeFailures = 0;

        // Ready nodes. The lock is scoped to this phase only, so it is released exactly once
        // no matter whether this phase or the following one throws.
        try (Mutex allocationLock = nodeRepository().lockAllocation()) {
            updateNodeLivenessEventsForReadyNodes(allocationLock);
            for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
                Node node = entry.getKey();
                if (throttle(node)) {
                    throttledNodeFailures++;
                } else {
                    nodeRepository().fail(node.hostname(), Agent.system, entry.getValue());
                }
            }
        }

        // Active nodes
        updateNodeDownState();
        List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
        for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason(activeNodes).entrySet()) {
            Node node = entry.getKey();
            if (!failAllowedFor(node.type())) {
                continue;
            }
            if (throttle(node)) {
                throttledNodeFailures++;
            } else {
                failActive(node, entry.getValue());
            }
        }

        // 1 if anything was throttled in this run, otherwise 0
        metric.set(throttlingActiveMetric, Math.min(1, throttledNodeFailures), (Metric.Context) null);
        metric.set(throttledNodeFailuresMetric, throttledNodeFailures, (Metric.Context) null);
    }

    /**
     * Copies the last request time known by the host liveness tracker into the history of each ready node,
     * as a {@code requested} event, unless the history already has a {@code requested} event after that time.
     *
     * @param mutex the lock passed on to the node repository when writing the updated nodes
     */
    private void updateNodeLivenessEventsForReadyNodes(Mutex mutex) {
        for (Node node : nodeRepository().getNodes(Node.State.ready)) {
            Optional<Instant> lastRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
            if (!lastRequest.isPresent()) {
                continue; // nothing known about this host
            }
            if (node.history().hasEventAfter(History.Event.Type.requested, lastRequest.get())) {
                continue; // a later request is already recorded
            }
            History.Event requested = new History.Event(History.Event.Type.requested, Agent.system, lastRequest.get());
            nodeRepository().write(node.with(node.history().with(requested)), mutex);
        }
    }

    /**
     * Finds the ready nodes that should be failed. The checks are applied in order and the first match wins:
     * missing config requests, hardware failure, hardware divergence, and finally failure reports on the
     * node's parent host (or on the node itself when it has no parent, or the parent is not found).
     *
     * @return the ready nodes to fail, each mapped to the reason for failing it
     */
    private Map<Node, String> getReadyNodesByFailureReason() {
        Instant now = clock.instant();

        // For the first two request intervals after construction the cutoff is EPOCH, which makes the
        // config request check pass for every node.
        boolean recentlyConstructed = constructionTime.isAfter(now.minus(nodeRequestInterval.multipliedBy(2)));
        Instant oldestAcceptableRequestTime = recentlyConstructed
                ? Instant.EPOCH
                : now.minus(downTimeLimit).minus(nodeRequestInterval);

        Map<Node, String> nodesByFailureReason = new HashMap<>();
        for (Node node : nodeRepository().getNodes(Node.State.ready)) {
            if (expectConfigRequests(node) && !hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) {
                nodesByFailureReason.put(node, "Not receiving config requests from node");
            } else if (node.status().hardwareFailureDescription().isPresent()) {
                nodesByFailureReason.put(node, "Node has hardware failure");
            } else if (node.status().hardwareDivergence().isPresent()) {
                nodesByFailureReason.put(node, "Node has hardware divergence");
            } else {
                // Use the parent host if there is one and it can be found, otherwise the node itself
                Node hostNode = node.parentHostname()
                                    .flatMap(parent -> nodeRepository().getNode(parent))
                                    .orElse(node);
                List<String> failureReports = reasonsToFailParentHost(hostNode);
                if (!failureReports.isEmpty()) {
                    String reason = hostNode.equals(node)
                            ? "Host has failure reports: " + failureReports
                            : "Parent (" + hostNode + ") has failure reports: " + failureReports;
                    nodesByFailureReason.put(node, reason);
                }
            }
        }
        return nodesByFailureReason;
    }

    /**
     * Synchronizes the down record of active nodes with the current service model snapshot:
     * a node whose services make it a {@link #badNode bad node} is recorded as down, any other node
     * present in the snapshot has its down record cleared. Hosts in the snapshot which are not
     * active nodes are ignored.
     */
    private void updateNodeDownState() {
        Map<String, Node> activeNodesByHostname = nodeRepository().getNodes(Node.State.active).stream()
                .collect(Collectors.toMap(Node::hostname, node -> node));

        serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName()
                .forEach((hostName, serviceInstances) -> {
                    Node node = activeNodesByHostname.get(hostName.s());
                    if (node == null) {
                        return; // not an active node
                    }
                    if (badNode(serviceInstances)) {
                        recordAsDown(node);
                    } else {
                        clearDownRecord(node);
                    }
                });
    }

    /**
     * Finds the active nodes that should be failed:
     * <ul>
     *   <li>nodes which have a down event older than the down time limit, unless their application is suspended</li>
     *   <li>otherwise, nodes whose host is suspended and which either have a hardware failure, or whose
     *       Docker host (the parent, or the node itself) has reports saying the host should be failed</li>
     * </ul>
     *
     * @param list the active nodes to examine; also used to find the children of a host
     * @return the active nodes to fail, each mapped to the reason for failing it
     */
    private Map<Node, String> getActiveNodesByFailureReason(List<Node> list) {
        Instant graceTimeEnd = clock.instant().minus(downTimeLimit);
        Map<Node, String> nodesByFailureReason = new HashMap<>();
        for (Node node : list) {
            if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && !applicationSuspended(node)) {
                nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
                continue;
            }
            if (!hostSuspended(node, list)) {
                continue;
            }
            if (node.status().hardwareFailureDescription().isPresent()) {
                nodesByFailureReason.put(node, "Node has hardware failure: " + node.status().hardwareFailureDescription().get());
                continue;
            }

            // Use the parent host if there is one and it can be found, otherwise the node itself
            Node hostNode = node.parentHostname()
                                .flatMap(parent -> nodeRepository().getNode(parent))
                                .orElse(node);
            if (!hostNode.type().isDockerHost()) {
                continue;
            }
            List<String> failureReports = reasonsToFailParentHost(hostNode);
            if (!failureReports.isEmpty()) {
                String reason = hostNode.equals(node)
                        ? "Host has failure reports: " + failureReports
                        : "Parent (" + hostNode + ") has failure reports: " + failureReports;
                nodesByFailureReason.put(node, reason);
            }
        }
        return nodesByFailureReason;
    }

    /**
     * Returns a description of each report on the given node whose type says the host should be failed.
     *
     * @param node the node whose reports are examined
     * @return one line per such report, of the form "&lt;report id&gt; reported &lt;created time&gt;: &lt;description&gt;";
     *         empty if there are none
     */
    private static List<String> reasonsToFailParentHost(Node node) {
        return node.reports().getReports().stream()
                   .filter(report -> report.getType().hostShouldBeFailed())
                   .map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription())
                   .collect(Collectors.toList());
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Returns whether the given node has a hardware failure description, a hardware divergence,
     * or whether its parent host (or the node itself, when it has no parent or the parent is not found)
     * has reports saying the host should be failed.
     *
     * @param node the node to check
     * @param nodeRepository used to look up the parent host of the node
     */
    public static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
        if (node.status().hardwareFailureDescription().isPresent()) {
            return true;
        }
        if (node.status().hardwareDivergence().isPresent()) {
            return true;
        }
        Node hostNode = node.parentHostname()
                            .flatMap(parent -> nodeRepository.getNode(parent))
                            .orElse(node);
        return !reasonsToFailParentHost(hostNode).isEmpty();
    }

    /** Returns whether config requests are expected from the given node, which is the case unless it is a Docker host. */
    private boolean expectConfigRequests(Node node) {
        boolean isDockerHost = node.type().isDockerHost();
        return !isDockerHost;
    }

    /**
     * Returns whether the node is considered to have requested config after the given instant.
     * A node which was not made ready before that instant gets the benefit of the doubt.
     */
    private boolean hasNodeRequestedConfigAfter(Node node, Instant instant) {
        if (!wasMadeReadyBefore(node, instant)) {
            return true; // too recently readied to judge
        }
        return hasRecordedRequestAfter(node, instant);
    }

    /** Returns whether the node's history has a {@code readied} event before the given instant. */
    private boolean wasMadeReadyBefore(Node node, Instant instant) {
        History history = node.history();
        return history.hasEventBefore(History.Event.Type.readied, instant);
    }

    /** Returns whether the node's history has a {@code requested} event after the given instant. */
    private boolean hasRecordedRequestAfter(Node node, Instant instant) {
        History history = node.history();
        return history.hasEventAfter(History.Event.Type.requested, instant);
    }

    /**
     * Returns whether the orchestrator reports the application owning the given node as allowed to be down.
     * An application unknown to the orchestrator counts as not suspended.
     */
    private boolean applicationSuspended(Node node) {
        ApplicationInstanceStatus status;
        try {
            status = orchestrator.getApplicationInstanceStatus(node.allocation().get().owner());
        } catch (ApplicationIdNotFoundException e) {
            // Unknown to the orchestrator: treat as not suspended
            return false;
        }
        return status == ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN;
    }

    /**
     * Returns whether the orchestrator reports the given node as allowed to be down.
     * A hostname unknown to the orchestrator counts as not suspended.
     */
    private boolean nodeSuspended(Node node) {
        HostStatus status;
        try {
            status = orchestrator.getNodeStatus(new HostName(node.hostname()));
        } catch (HostNameNotFoundException e) {
            // Unknown to the orchestrator: treat as not suspended
            return false;
        }
        return status == HostStatus.ALLOWED_TO_BE_DOWN;
    }

    /**
     * Returns whether the given node, and for a node without a parent also all of its children, are suspended.
     *
     * @param node the node to check
     * @param list the nodes among which the children of {@code node} are looked up
     * @return false if the node itself is not suspended; true if it is suspended and has a parent;
     *         otherwise whether every node in {@code list} having this node as parent is suspended
     */
    private boolean hostSuspended(Node node, List<Node> list) {
        if (!nodeSuspended(node)) {
            return false;
        }
        if (node.parentHostname().isPresent()) {
            return true;
        }
        for (Node candidate : list) {
            boolean isChild = candidate.parentHostname()
                                       .filter(parent -> parent.equals(node.hostname()))
                                       .isPresent();
            if (isChild && !nodeSuspended(candidate)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns whether active nodes of the given type may currently be failed.
     *
     * @param nodeType the type of the node considered for failing
     * @return true for tenant and host nodes; for proxy and proxyhost nodes only if no node of that
     *         same type is already in the failed state; false for every other type
     */
    private boolean failAllowedFor(NodeType nodeType) {
        // Switch on the enum itself rather than on an ordinal-indexed lookup table
        switch (nodeType) {
            case tenant:
            case host:
                return true;
            case proxy:
            case proxyhost:
                return nodeRepository().getNodes(nodeType, Node.State.failed).isEmpty();
            default:
                return false;
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Returns whether a node with the given service instances should be considered bad:
     * none of its services are {@code UP} and at least one is {@code DOWN}.
     * Services in any other status do not count either way, so an empty list is not bad.
     *
     * @param list the service instances running on the node
     */
    public static boolean badNode(List<ServiceInstance> list) {
        Map<ServiceStatus, Long> countsByStatus = list.stream()
                .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, Collectors.counting()));
        return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L
                && countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
    }

    /**
     * Records the given node as down as of now, unless its history already has a down event.
     * The node is re-read in the active state under the application lock before it is written.
     *
     * @param node the active node to mark as down
     * @return the given node if it already had a down event, otherwise the node as written
     * @throws java.util.NoSuchElementException if the node has no allocation, or is no longer
     *         found in the active state once the lock is held
     */
    private Node recordAsDown(Node node) {
        if (node.history().event(History.Event.Type.down).isPresent()) {
            return node; // already recorded as down: keep the original down time
        }
        try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
            Node current = nodeRepository().getNode(node.hostname(), Node.State.active).get(); // fresh copy under lock
            return nodeRepository().write(current.downAt(clock.instant()), lock);
        }
    }

    /**
     * Marks the given node as up again if its history has a down event; does nothing otherwise.
     * The node is re-read in the active state under the application lock before it is written.
     *
     * @param node the active node to clear the down record of
     * @throws java.util.NoSuchElementException if the node has no allocation, or is no longer
     *         found in the active state once the lock is held
     */
    private void clearDownRecord(Node node) {
        if (!node.history().event(History.Event.Type.down).isPresent()) {
            return; // nothing to clear
        }
        try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
            Node current = nodeRepository().getNode(node.hostname(), Node.State.active).get(); // fresh copy under lock
            nodeRepository().write(current.up(), lock);
        }
    }

    /**
     * Fails an active node and redeploys its application without it.
     * <p>
     * Children of the node are handled first: active children are failed recursively through this method,
     * children in other states are failed directly. The node itself is only failed if every active child
     * was failed successfully. If the redeployment then fails with a non-transient error, the node is
     * reactivated.
     *
     * @param node the active node to fail
     * @param str the reason for failing the node
     * @return true if the node was failed (including when redeployment hit a transient error);
     *         false if no deployment could be obtained, a child could not be failed,
     *         or redeployment failed and the node was reactivated
     */
    private boolean failActive(Node node, String str) {
        Optional<Deployment> deployment =
                deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30));
        if (!deployment.isPresent()) {
            return false;
        }

        try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
            // All active children must be failed successfully before the node itself is failed
            boolean allActiveChildrenFailed = true;
            String reasonForChildFailure = "Failing due to parent host " + node.hostname() + " failure: " + str;
            for (Node child : nodeRepository().list().childrenOf(node)) {
                if (child.state() == Node.State.active) {
                    allActiveChildrenFailed &= failActive(child, reasonForChildFailure);
                } else {
                    nodeRepository().fail(child.hostname(), Agent.system, reasonForChildFailure);
                }
            }
            if (!allActiveChildrenFailed) {
                return false;
            }

            Node failedNode = nodeRepository().fail(node.hostname(), Agent.system, str);
            try {
                deployment.get().activate();
                return true;
            } catch (TransientException e) {
                // The node stays failed; the message states that the redeploy will be retried elsewhere
                log.log(LogLevel.INFO, "Failed to redeploy " + failedNode.allocation().get().owner() + " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e));
                return true;
            } catch (RuntimeException e) {
                // Redeploying without the node did not work: put the node back in service
                nodeRepository().reactivate(failedNode.hostname(), Agent.system, "Failed to redeploy after being failed by NodeFailer");
                log.log(Level.WARNING, "Attempted to fail " + failedNode + " for " + failedNode.allocation().get().owner() + ", but redeploying without the node failed", e);
                return false;
            }
        }
    }

    /**
     * Decides whether failing the given node must be held back by the throttle policy.
     * <p>
     * Failing is not throttled when the policy is disabled, when fewer nodes have a failed event inside the
     * throttle window than the policy allows for the total node count, or when the node has no parent and
     * fewer than the policy's minimum number of parents are among the recently failed nodes.
     * When throttling applies, this is logged.
     *
     * @param node the node that is about to be failed
     * @return true if the node must not be failed now
     */
    private boolean throttle(Node node) {
        if (throttlePolicy == ThrottlePolicy.disabled) {
            return false;
        }

        Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
        List<Node> allNodes = nodeRepository().getNodes();
        List<Node> failedInWindow = allNodes.stream()
                .filter(candidate -> candidate.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))
                .collect(Collectors.toList());
        NodeList recentlyFailedNodes = new NodeList(failedInWindow);

        // Still room within the overall limit
        boolean belowOverallLimit = recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size());
        if (belowOverallLimit) {
            return false;
        }

        // A node without a parent is always allowed through until the minimum number of parents have failed
        boolean hasNoParent = !node.parentHostname().isPresent();
        if (hasNoParent && recentlyFailedNodes.parents().size() < throttlePolicy.minimumAllowedToFail) {
            return false;
        }

        log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), throttlePolicy.toHumanReadableString(allNodes.size())));
        return true;
    }
}
